1212
logger = logging.getLogger(__name__)

# Cut-offs for how many of the most frequently referenced works are looked up
# when enriching a collection. They are passed to ``Counter.most_common`` in
# the builder: the smaller one for the "common" mode, the larger for "most".
_COMMON_REFERENCES = 400
_MOST_REFERENCES = 2000
1617
1718
class EnrichReferences(Enum):
    """How far to enrich an OpenAlex collection with referenced works.

    The mode decides which of the works referenced by the query results
    are fetched in addition to the results themselves.
    """

    # No extra references are selected; the set of missing works stays empty.
    BASIC = "basic"
    # Select the ``_COMMON_REFERENCES`` most frequently referenced works and
    # fetch those that the query did not already return.
    COMMON = "common"
    # Same selection as COMMON, but with the larger ``_MOST_REFERENCES`` cut-off.
    MOST = "most"
    # Fetch every referenced work that the query did not already return.
    FULL = "full"
2426
2527
@@ -30,12 +32,12 @@ def __init__(
3032 self ,
3133 query : str ,
3234 limit : int = 600 ,
33- references : HandleReferences = HandleReferences .BASIC ,
35+ enrich : EnrichReferences = EnrichReferences .BASIC ,
3436 client : Optional [OpenAlexClient ] = None ,
3537 ) -> None :
3638 self .query = query
3739 self .limit = limit
38- self .references = references
40+ self .enrich = enrich
3941 self .client = client or OpenAlexClient ()
4042
4143 def build (self ) -> Collection :
@@ -44,21 +46,23 @@ def build(self) -> Collection:
4446 works = self .client .list_recent_articles (self .query , self .limit )
4547 cache = {work .id : work for work in works }
4648 references : list [str ] = []
49+ missing = set ()
4750 for work in works :
4851 references .extend (work .referenced_works )
49- if self .references == HandleReferences .COMMON :
52+ if self .enrich in ( EnrichReferences .COMMON , EnrichReferences . MOST ) :
5053 counter = Counter (references )
51- most_common = {key for key , _ in counter .most_common (MAX_REFERENCES )}
54+ count = (
55+ _MOST_REFERENCES
56+ if self .enrich == EnrichReferences .MOST
57+ else _COMMON_REFERENCES
58+ )
59+ most_common = {key for key , _ in counter .most_common (count )}
5260 missing = most_common - set (cache .keys ())
53- logger .info ("fetching %d missing references" , len (missing ))
54- missing_works = self .client .list_articles_by_openalex_id (list (missing ))
55- cache .update ({work .id : work for work in missing_works })
56- if self .references == HandleReferences .FULL :
61+ if self .enrich == EnrichReferences .FULL :
5762 missing = set (references ) - set (cache .keys ())
58- logger .info ("fetching %d missing references" , len (missing ))
59- missing_works = self .client .list_articles_by_openalex_id (list (missing ))
60- cache .update ({work .id : work for work in missing_works })
61-
63+ logger .info ("fetching %d missing references" , len (missing ))
64+ missing_works = self .client .list_articles_by_openalex_id (list (missing ))
65+ cache .update ({work .id : work for work in missing_works })
6266 article_cache = {
6367 openalexid : self ._work_to_article (work )
6468 for openalexid , work in cache .items ()
0 commit comments