Skip to content

Commit c74aab7

Browse files
jamesbraza and Copilot
authored
Documenting why we drop embeddings (#995)
Co-authored-by: Copilot <[email protected]>
1 parent b9a0d76 commit c74aab7

File tree

3 files changed

+15
-5
lines changed

3 files changed

+15
-5
lines changed

paperqa/agents/models.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ def strip_answer(
7272
) -> PQASession:
7373
# This modifies in place, this is fine
7474
# because when a response is being constructed,
75-
# we should be done with the Answer object
75+
# we should be done with the PQASession object
7676
v.filter_content_for_user()
7777
return v
7878

paperqa/core.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -209,9 +209,12 @@ async def map_fxn_summary(
209209
context=context,
210210
question=question,
211211
text=Text(
212-
text=text.text,
213-
name=text.name,
212+
# Embeddings enable the retrieval of Texts to make Contexts.
213+
# Once we already have Contexts, we filter them by score
214+
# (and not the underlying Text's embeddings),
215+
# so embeddings can be safely dropped from the deepcopy
214216
doc=text.doc.model_dump(exclude={"embedding"}),
217+
**text.model_dump(exclude={"embedding", "doc"}),
215218
),
216219
score=score, # pylint: disable=possibly-used-before-assignment
217220
**extras,

paperqa/types.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -289,16 +289,23 @@ def get_unique_docs_from_contexts(self, score_threshold: int = 0) -> set[Doc]:
289289
}
290290

291291
def filter_content_for_user(self) -> None:
292-
"""Filter out extra items (inplace) that do not need to be returned to the user."""
292+
"""
293+
In-place filter/drop items that are irrelevant to the user.
294+
295+
This is mainly done to keep HTTP requests reasonably sized.
296+
"""
293297
self.contexts = [
294298
Context(
295299
# Dump all fields from the original context (including extras),
296300
# but exclude 'text' so we can replace it below.
297301
**c.model_dump(exclude={"text"}),
298302
text=Text(
299303
text="",
300-
**c.text.model_dump(exclude={"text", "embedding", "doc"}),
304+
# Similar to the explanation in `map_fxn_summary`'s internals
305+
# on why we drop embeddings, drop embeddings here too because
306+
# embeddings aren't displayed to front end users
301307
doc=c.text.doc.model_dump(exclude={"embedding"}),
308+
**c.text.model_dump(exclude={"text", "embedding", "doc"}),
302309
),
303310
)
304311
for c in self.contexts

0 commit comments

Comments
 (0)