
Commit ad16646

mskarlin and Copilot authored
Allow users to skip citation stripping from evidence (#1035)
Co-authored-by: Copilot <[email protected]>
1 parent a1571f4 · commit ad16646

File tree

5 files changed: +20 −5 lines changed


README.md

Lines changed: 1 addition & 0 deletions
@@ -887,6 +887,7 @@ will return much faster than the first query and we'll be certain the authors ma
 | `answer.get_evidence_if_no_contexts` | `True` | Allow lazy evidence gathering. |
 | `answer.group_contexts_by_question` | `False` | Groups the final contexts by the underlying `gather_evidence` question in the final context prompt. |
 | `answer.evidence_relevance_score_cutoff` | `1` | Cutoff evidence relevance score to include in the answer context (inclusive) |
+| `answer.skip_evidence_citation_strip` | `False` | Skip removal of citations from the `gather_evidence` contexts |
 | `parsing.chunk_size` | `5000` | Characters per chunk (0 for no chunking). |
 | `parsing.page_size_limit` | `1,280,000` | Character limit per page. |
 | `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
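
A minimal usage sketch for the new option via the standard `paperqa` `Docs`/`Settings` workflow; the document path and query string below are purely illustrative:

```python
from paperqa import Docs, Settings

# Keep citations such as "(Smith 1999)" inside gathered evidence contexts
# instead of stripping them before the answer prompt is built.
settings = Settings(answer={"skip_evidence_citation_strip": True})

docs = Docs()
# docs.add("my_paper.pdf")  # hypothetical document to index
# answer = await docs.aquery("Is XAI usable in chemistry?", settings=settings)
```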

src/paperqa/core.py

Lines changed: 5 additions & 1 deletion
@@ -135,6 +135,7 @@ async def map_fxn_summary(
     extra_prompt_data: dict[str, str] | None = None,
     parser: Callable[[str], dict[str, Any]] | None = None,
     callbacks: Sequence[Callable[[str], None]] | None = None,
+    skip_citation_strip: bool = False,
 ) -> tuple[Context, LLMResult]:
     """Parses the given text and returns a context object with the parser and prompt runner.
 
@@ -152,6 +153,7 @@ async def map_fxn_summary(
         parser: Optional parser function to parse LLM output into structured data.
             Should return dict with at least 'summary' field.
         callbacks: Optional sequence of callback functions to execute during LLM calls.
+        skip_citation_strip: Optional skipping of citation stripping, if you want to keep in the context.
 
     Returns:
         The context object and LLMResult to get info about the LLM execution.
@@ -206,7 +208,9 @@ async def map_fxn_summary(
             score = 5
             success = True
     # remove citations that collide with our grounded citations (for the answer LLM)
-    context = strip_citations(context)
+    if not skip_citation_strip:
+        context = strip_citations(context)
+
     if not success:
         score = extract_score(context)
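
The behavior change is confined to the branch above: `strip_citations` is only applied when `skip_citation_strip` is left at its default of `False`. A rough, self-contained illustration of that branch follows; the regex helper is a simplified stand-in, not paperqa's actual `strip_citations` implementation:

```python
import re


def strip_citations_sketch(text: str) -> str:
    """Simplified stand-in for paperqa's strip_citations: drops parenthetical
    author-year citations like "(Smith 1999)" from a context string."""
    return re.sub(r"\s*\([A-Z][A-Za-z]+(?: et al\.)?,? \d{4}[a-z]?\)", "", text)


context = "Explanation about XAI and molecules (Smith 1999)."
skip_citation_strip = False  # the new keyword argument, default False

# Mirrors the changed lines in map_fxn_summary: strip only when not skipped.
if not skip_citation_strip:
    context = strip_citations_sketch(context)

print(context)  # -> "Explanation about XAI and molecules."
```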

src/paperqa/docs.py

Lines changed: 1 addition & 0 deletions
@@ -669,6 +669,7 @@ async def aget_evidence(
                     },
                     parser=llm_parse_json if prompt_config.use_json else None,
                     callbacks=callbacks,
+                    skip_citation_strip=answer_config.skip_evidence_citation_strip,
                 )
                 for m in matches
             ],
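
Here `aget_evidence` simply threads `answer_config.skip_evidence_citation_strip` into each `map_fxn_summary` call. A sketch of invoking it with the flag enabled; the document path is hypothetical and the exact `aget_evidence` keyword arguments may differ between paperqa versions:

```python
import asyncio

from paperqa import Docs, Settings


async def main() -> None:
    docs = Docs()
    # await docs.aadd("my_paper.pdf")  # hypothetical document to index first
    session = await docs.aget_evidence(
        "Is XAI usable in chemistry?",
        settings=Settings(answer={"skip_evidence_citation_strip": True}),
    )
    for ctx in session.contexts:
        # citations emitted by the summary LLM are no longer stripped here
        print(ctx.context)


asyncio.run(main())
```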

src/paperqa/settings.py

Lines changed: 4 additions & 0 deletions
@@ -124,6 +124,10 @@ class AnswerSettings(BaseModel):
         default=False,
         description="Whether to group contexts by question when generating answers.",
     )
+    skip_evidence_citation_strip: bool = Field(
+        default=False,
+        description="Whether to skip stripping citations from evidence.",
+    )
 
     @model_validator(mode="after")
     def _deprecated_field(self) -> Self:
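
Because the new `AnswerSettings` field defaults to `False`, existing configurations keep stripping citations unless they opt in. A short sketch of both forms:

```python
from paperqa import Settings

# Default: citations are still stripped, so existing behavior is unchanged.
assert Settings().answer.skip_evidence_citation_strip is False

# Opt in via the nested "answer" dict, which populates AnswerSettings.
settings = Settings(answer={"skip_evidence_citation_strip": True})
assert settings.answer.skip_evidence_citation_strip is True
```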

tests/test_paperqa.py

Lines changed: 9 additions & 4 deletions
@@ -579,7 +579,7 @@ async def test_aquery_groups_contexts_by_question(docs_fixture) -> None:
     session.contexts = [
         Context(
             text=text1,
-            context="Explanation about XAI and molecules.",
+            context="Explanation about XAI and molecules (Smith 1999).",
             score=6,
             question="Is XAI usable in chemistry?",
         ),
@@ -599,7 +599,10 @@ async def test_aquery_groups_contexts_by_question(docs_fixture) -> None:
 
     settings = Settings(
         prompts={"answer_iteration_prompt": None},
-        answer={"group_contexts_by_question": True},
+        answer={
+            "group_contexts_by_question": True,
+            "skip_evidence_citation_strip": True,
+        },
     )
 
     result = await docs_fixture.aquery(session, settings=settings)
@@ -616,7 +619,7 @@ async def test_aquery_groups_contexts_by_question(docs_fixture) -> None:
         in final_context_str
     )
 
-    assert "Explanation about XAI and molecules." in final_context_str
+    assert "Explanation about XAI and molecules (Smith 1999)." in final_context_str
     assert "Details on how drug discovery leverages AI." in final_context_str
     assert "General facts about organic chemistry." in final_context_str
 
@@ -627,7 +630,9 @@ async def test_aquery_groups_contexts_by_question(docs_fixture) -> None:
     q2_header_pos = final_context_str.find(
         'Contexts related to the question: "What is organic chemistry?"'
     )
-    context1_pos = final_context_str.find("Explanation about XAI and molecules.")
+    context1_pos = final_context_str.find(
+        "Explanation about XAI and molecules (Smith 1999)."
+    )
     context3_pos = final_context_str.find("General facts about organic chemistry.")
 
     assert (

0 commit comments
