1- from typing import List , Dict , NamedTuple , Optional , Set
1+ """
2+ This submodule contains a data structure for storing lexical
3+ indexes over various biomedical vocabularies.
4+
5+ There are several built-in vocabularies, which can be imported
6+ and instantiated as follows:
7+
8+ .. code-block:: python
9+
10+ from scispacy.linking_utils import UmlsKnowledgeBase
11+
12+ kb = UmlsKnowledgeBase()
13+
14+ In general, new :class:`KnowledgeBase` objects can be constructed
15+ from a list of :class:`Entity` objects, or a path to a JSON or JSONL
16+ file containing dictionaries shaped the same way:
17+
18+ .. code-block:: python
19+
20+ from scispacy.linking_utils import KnowledgeBase
21+
22+ # UMLS
23+ kb = KnowledgeBase(
24+ "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/"
25+ "data/kbs/2023-04-23/umls_mesh_2022.jsonl"
26+ )
27+
28+ """
29+
230import json
331from collections import defaultdict
32+ from contextlib import contextmanager
33+ from pathlib import Path
34+ from typing import (
35+ List ,
36+ Dict ,
37+ NamedTuple ,
38+ Optional ,
39+ Set ,
40+ Union ,
41+ Iterable ,
42+ Tuple ,
43+ DefaultDict ,
44+ Generator ,
45+ )
446
547from scispacy .file_cache import cached_path
648from scispacy .umls_semantic_type_tree import (
749 UmlsSemanticTypeTree ,
850 construct_umls_tree_from_tsv ,
951)
1052
53+ __all__ = [
54+ "Entity" ,
55+ "KnowledgeBase" ,
56+ "UmlsKnowledgeBase" ,
57+ "Mesh" ,
58+ "GeneOntology" ,
59+ "HumanPhenotypeOntology" ,
60+ "RxNorm" ,
61+ ]
62+
1163
1264class Entity (NamedTuple ):
1365 concept_id : str
@@ -38,6 +90,53 @@ def __repr__(self):
3890DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv"
3991
4092
@contextmanager
def _iter_entities(
    path_or_entities: Union[str, Path, Iterable[Entity]],
) -> Generator[Iterable[Entity], None, None]:
    """Yield an iterable of entities read from a file, or pass one through.

    Parameters
    ----------
    path_or_entities :
        Either a location (``str`` or :class:`pathlib.Path`) that is handed
        to ``cached_path`` and then opened as a JSON or JSONL file, or an
        already-built iterable of :class:`Entity` objects, which is yielded
        unchanged.

    Yields
    ------
    An iterable of :class:`Entity` objects. When reading from a file the
    iterable is a lazy generator, so it must be consumed *inside* the
    ``with`` block; the underlying file is closed when the block exits.
    """
    if not isinstance(path_or_entities, (str, Path)):
        # Already an iterable of entities: hand it back untouched.
        yield path_or_entities
        return

    # NOTE(review): cached_path is defined elsewhere; it is assumed to return
    # a path that can be opened locally. str() keeps the suffix check below
    # valid even if it hands back a Path-like object.
    local_path = str(cached_path(path_or_entities))

    # JSON is UTF-8 by specification, so do not rely on the platform's
    # default encoding. The context manager keeps the file open only for as
    # long as the caller is iterating.
    with open(local_path, encoding="utf-8") as file:
        if local_path.endswith("jsonl"):
            # One JSON object per line; skip blank lines (e.g. a trailing
            # newline at the end of the file) instead of failing to parse.
            yield (
                Entity(**json.loads(line)) for line in file if line.strip()
            )
        else:
            yield (Entity(**record) for record in json.load(file))
111+
112+
def _index_entities(
    entities: Iterable[Entity],
) -> Tuple[Dict[str, Entity], Dict[str, Set[str]]]:
    """Build the two lookup tables that back a :class:`KnowledgeBase`.

    Parameters
    ----------
    entities :
        Any iterable of entity objects; it is traversed exactly once.

    Returns
    -------
    A 2-tuple of plain dictionaries:

    1. concept identifier -> entity object (if the same identifier occurs
       more than once, the last entity seen wins)
    2. name (the canonical name or any alias) -> set of concept identifiers
       that carry that name
    """
    entity_by_id: Dict[str, Entity] = {}
    ids_by_name: DefaultDict[str, Set[str]] = defaultdict(set)

    for entity in entities:
        concept_id = entity.concept_id
        # The canonical name is indexed exactly like an alias, so a lookup
        # by either kind of name goes through the same table.
        for name in (entity.canonical_name, *entity.aliases):
            ids_by_name[name].add(concept_id)
        entity_by_id[concept_id] = entity

    # Convert to a regular dict so that looking up an unknown name later
    # does not silently insert an empty set.
    return entity_by_id, dict(ids_by_name)
138+
139+
41140class KnowledgeBase :
42141 """
43142 A class representing two commonly needed views of a Knowledge Base:
@@ -50,31 +149,20 @@ class KnowledgeBase:
50149 The file path to the json/jsonl representation of the KB to load, or an iterable of :class:`Entity` objects.
51150 """
52151
152+ cui_to_entity : Dict [str , Entity ]
153+ alias_to_cuis : Dict [str , Set [str ]]
154+
53155 def __init__ (
54156 self ,
55- file_path : Optional [ str ] = None ,
157+ file_path : Union [ None , str , Path , Iterable [ Entity ] ] = None ,
56158 ):
57159 if file_path is None :
58160 raise ValueError (
59161 "Do not use the default arguments to KnowledgeBase. "
60162 "Instead, use a subclass (e.g UmlsKnowledgeBase) or pass a path to a kb."
61163 )
62- if file_path .endswith ("jsonl" ):
63- raw = (json .loads (line ) for line in open (cached_path (file_path )))
64- else :
65- raw = json .load (open (cached_path (file_path )))
66-
67- alias_to_cuis : Dict [str , Set [str ]] = defaultdict (set )
68- self .cui_to_entity : Dict [str , Entity ] = {}
69-
70- for concept in raw :
71- unique_aliases = set (concept ["aliases" ])
72- unique_aliases .add (concept ["canonical_name" ])
73- for alias in unique_aliases :
74- alias_to_cuis [alias ].add (concept ["concept_id" ])
75- self .cui_to_entity [concept ["concept_id" ]] = Entity (** concept )
76-
77- self .alias_to_cuis : Dict [str , Set [str ]] = {** alias_to_cuis }
164+ with _iter_entities (file_path ) as entities :
165+ self .cui_to_entity , self .alias_to_cuis = _index_entities (entities )
78166
79167
80168class UmlsKnowledgeBase (KnowledgeBase ):
0 commit comments