Skip to content

Commit d34543e

Browse files
authored
Documenting DocMetadataTask/MetadataProvider (#1050)
1 parent: 948423f; commit: d34543e

File tree

2 files changed

+26
-13
lines changed

2 files changed

+26
-13
lines changed

src/paperqa/clients/__init__.py

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
import aiohttp
99
from lmi.utils import gather_with_concurrency
10-
from pydantic import BaseModel, ConfigDict
10+
from pydantic import BaseModel, ConfigDict, Field
1111

1212
from paperqa.types import Doc, DocDetails
1313

@@ -36,21 +36,33 @@
3636

3737

3838
class DocMetadataTask(BaseModel):
39-
"""Holder for provider and processor tasks."""
39+
"""Simple container pairing metadata providers with processors."""
4040

4141
model_config = ConfigDict(arbitrary_types_allowed=True)
4242

43-
providers: Collection[MetadataProvider]
44-
processors: Collection[MetadataPostProcessor]
43+
providers: Collection[MetadataProvider] = Field(
44+
description=(
45+
"Metadata providers allotted to this task."
46+
" An example would be providers for Crossref and Semantic Scholar."
47+
)
48+
)
49+
processors: Collection[MetadataPostProcessor] = Field(
50+
description=(
51+
"Metadata post-processors allotted to this task."
52+
" An example would be a journal quality filter."
53+
)
54+
)
4555

4656
def provider_queries(
4757
self, query: dict
4858
) -> list[Coroutine[Any, Any, DocDetails | None]]:
59+
"""Set up query coroutines for each contained metadata provider."""
4960
return [p.query(query) for p in self.providers]
5061

5162
def processor_queries(
5263
self, doc_details: DocDetails, session: aiohttp.ClientSession
5364
) -> list[Coroutine[Any, Any, DocDetails]]:
65+
"""Set up process coroutines for each contained metadata post-processor."""
5466
return [
5567
p.process(copy.copy(doc_details), session=session) for p in self.processors
5668
]
@@ -78,7 +90,6 @@ def __init__(
7890
if nested, will query in order looking for termination criteria after each.
7991
Will terminate early if either DocDetails.is_hydration_needed is False OR if
8092
all requested fields are present in the DocDetails object.
81-
8293
"""
8394
self._session = session
8495
self.tasks: list[DocMetadataTask] = []

src/paperqa/clients/client_models.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -88,25 +88,28 @@ class JournalQuery(ClientQuery):
8888

8989

9090
class MetadataProvider(ABC, Generic[ClientQueryType]):
91-
"""Provide metadata from a query by any means necessary."""
91+
"""Provide metadata from a query by any means necessary.
92+
93+
An example is going from a DOI to full paper metadata using Semantic Scholar.
94+
"""
9295

9396
async def query(self, query: dict) -> DocDetails | None:
94-
return await self._query(self.query_transformer(query))
97+
return await self._query(self.query_factory(query))
9598

9699
@abstractmethod
97100
async def _query(self, query: ClientQueryType) -> DocDetails | None:
98-
pass
101+
"""Run a query against the provider."""
99102

100103
@abstractmethod
101-
def query_transformer(self, query: dict) -> ClientQueryType:
102-
pass
104+
def query_factory(self, query: dict) -> ClientQueryType:
105+
"""Create a query object from unstructured query data."""
103106

104107

105108
class DOIOrTitleBasedProvider(MetadataProvider[DOIQuery | TitleAuthorQuery]):
106109

107110
async def query(self, query: dict) -> DocDetails | None:
108111
try:
109-
client_query = self.query_transformer(query)
112+
client_query = self.query_factory(query)
110113
return await self._query(client_query)
111114
# We allow graceful failures, i.e. return "None" for both DOI errors and timeout errors
112115
# DOINotFoundError means the paper doesn't exist in the source, the timeout is to prevent
@@ -150,7 +153,7 @@ async def _query(self, query: DOIQuery | TitleAuthorQuery) -> DocDetails | None:
150153
TimeoutError: When the request takes too long on the client side
151154
"""
152155

153-
def query_transformer(self, query: dict) -> DOIQuery | TitleAuthorQuery:
156+
def query_factory(self, query: dict) -> DOIQuery | TitleAuthorQuery:
154157
try:
155158
if "doi" in query:
156159
return DOIQuery(**query)
@@ -169,7 +172,6 @@ class MetadataPostProcessor(ABC, Generic[ClientQueryType]):
169172
170173
MetadataPostProcessor should be idempotent and not order-dependent, i.e.
171174
all MetadataPostProcessor instances should be able to run in parallel.
172-
173175
"""
174176

175177
async def process(self, doc_details: DocDetails, **kwargs) -> DocDetails:

0 commit comments

Comments
 (0)