Skip to content

Commit 49c6472

Browse files
authored
add enrich most references option (#35)
* Add option to pull most references
* Change to enrich references
* Update version
1 parent 2b03aa0 commit 49c6472

File tree

4 files changed

+29
-23
lines changed

4 files changed

+29
-23
lines changed

src/bibx/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from bibx.algorithms.sap import Sap
77
from bibx.article import Article
8-
from bibx.builders.openalex import HandleReferences, OpenAlexCollectionBuilder
8+
from bibx.builders.openalex import EnrichReferences, OpenAlexCollectionBuilder
99
from bibx.builders.scopus_bib import ScopusBibCollectionBuilder
1010
from bibx.builders.scopus_ris import ScopusRisCollectionBuilder
1111
from bibx.builders.wos import WosCollectionBuilder
@@ -17,7 +17,7 @@
1717
__all__ = [
1818
"Article",
1919
"Collection",
20-
"HandleReferences",
20+
"EnrichReferences",
2121
"Sap",
2222
"query_openalex",
2323
"read_any",
@@ -26,16 +26,16 @@
2626
"read_wos",
2727
]
2828

29-
__version__ = "0.4.1"
29+
__version__ = "0.5.0"
3030

3131

3232
def query_openalex(
3333
query: str,
3434
limit: int = 600,
35-
references: HandleReferences = HandleReferences.BASIC,
35+
enrich: EnrichReferences = EnrichReferences.BASIC,
3636
) -> Collection:
3737
"""Query OpenAlex and return a collection."""
38-
return OpenAlexCollectionBuilder(query, limit, references=references).build()
38+
return OpenAlexCollectionBuilder(query, limit, enrich=enrich).build()
3939

4040

4141
def read_scopus_bib(*files: TextIO) -> Collection:

src/bibx/builders/openalex.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,16 @@
1212

1313
logger = logging.getLogger(__name__)
1414

15-
MAX_REFERENCES = 400
15+
_COMMON_REFERENCES = 400
16+
_MOST_REFERENCES = 2000
1617

1718

18-
class HandleReferences(Enum):
19+
class EnrichReferences(Enum):
1920
"""How to handle references when building an openalex collection."""
2021

2122
BASIC = "basic"
2223
COMMON = "common"
24+
MOST = "most"
2325
FULL = "full"
2426

2527

@@ -30,12 +32,12 @@ def __init__(
3032
self,
3133
query: str,
3234
limit: int = 600,
33-
references: HandleReferences = HandleReferences.BASIC,
35+
enrich: EnrichReferences = EnrichReferences.BASIC,
3436
client: Optional[OpenAlexClient] = None,
3537
) -> None:
3638
self.query = query
3739
self.limit = limit
38-
self.references = references
40+
self.enrich = enrich
3941
self.client = client or OpenAlexClient()
4042

4143
def build(self) -> Collection:
@@ -44,21 +46,23 @@ def build(self) -> Collection:
4446
works = self.client.list_recent_articles(self.query, self.limit)
4547
cache = {work.id: work for work in works}
4648
references: list[str] = []
49+
missing = set()
4750
for work in works:
4851
references.extend(work.referenced_works)
49-
if self.references == HandleReferences.COMMON:
52+
if self.enrich in (EnrichReferences.COMMON, EnrichReferences.MOST):
5053
counter = Counter(references)
51-
most_common = {key for key, _ in counter.most_common(MAX_REFERENCES)}
54+
count = (
55+
_MOST_REFERENCES
56+
if self.enrich == EnrichReferences.MOST
57+
else _COMMON_REFERENCES
58+
)
59+
most_common = {key for key, _ in counter.most_common(count)}
5260
missing = most_common - set(cache.keys())
53-
logger.info("fetching %d missing references", len(missing))
54-
missing_works = self.client.list_articles_by_openalex_id(list(missing))
55-
cache.update({work.id: work for work in missing_works})
56-
if self.references == HandleReferences.FULL:
61+
if self.enrich == EnrichReferences.FULL:
5762
missing = set(references) - set(cache.keys())
58-
logger.info("fetching %d missing references", len(missing))
59-
missing_works = self.client.list_articles_by_openalex_id(list(missing))
60-
cache.update({work.id: work for work in missing_works})
61-
63+
logger.info("fetching %d missing references", len(missing))
64+
missing_works = self.client.list_articles_by_openalex_id(list(missing))
65+
cache.update({work.id: work for work in missing_works})
6266
article_cache = {
6367
openalexid: self._work_to_article(work)
6468
for openalexid, work in cache.items()

src/bibx/cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
)
1717
from bibx.algorithms.preprocess import Preprocess
1818
from bibx.algorithms.sap import Sap
19-
from bibx.builders.openalex import HandleReferences
19+
from bibx.builders.openalex import EnrichReferences
2020
from bibx.collection import Collection
2121

2222
app = typer.Typer()
@@ -80,9 +80,9 @@ def sap(filename: str) -> None:
8080
@app.command()
8181
def openalex(
8282
query: list[str],
83-
references: HandleReferences = typer.Option(
83+
enrich: EnrichReferences = typer.Option(
8484
help="how to handle references",
85-
default=HandleReferences.BASIC,
85+
default=EnrichReferences.BASIC,
8686
),
8787
verbose: bool = typer.Option(
8888
help="be more verbose",
@@ -92,7 +92,7 @@ def openalex(
9292
"""Run the sap algorithm on a seed file of any supported format."""
9393
if verbose:
9494
logging.basicConfig(level=logging.INFO)
95-
c = query_openalex(" ".join(query), references=references)
95+
c = query_openalex(" ".join(query), enrich=enrich)
9696
s = Sap()
9797
graph = s.create_graph(c)
9898
graph = s.clean_graph(graph)

src/bibx/clients/openalex.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ def list_recent_articles(self, query: str, limit: int = 600) -> list[Work]:
171171

172172
def list_articles_by_openalex_id(self, ids: list[str]) -> list[Work]:
173173
"""List articles by openalex id."""
174+
if not ids:
175+
return []
174176
select = ",".join(Work.model_fields.keys())
175177
results: list[Work] = []
176178
with ThreadPoolExecutor(max_workers=5) as executor:

0 commit comments

Comments
 (0)