diff --git a/docs/parsers.rst b/docs/parsers.rst new file mode 100644 index 00000000..9b426bf7 --- /dev/null +++ b/docs/parsers.rst @@ -0,0 +1,126 @@ +Parsers +======== + +Field descriptions +------------------ + +Taken from: + +https://www.nlm.nih.gov/healthit/snomedct/us_edition.html + +Download a zip file there, and inside there will be the following PDF, which documents the fields as shown below. + +doc_Icd10cmMapReleaseNotes_Current-en-US_US1000124_20210901.pdf + +More info: + +https://www.nlm.nih.gov/research/umls/mapping_projects/snomedct_to_icd10cm.html + +FIELD,DATA_TYPE,PURPOSE,SSSOM Dev Comments +- id,UUID,A 128 bit unsigned integer, uniquely identifying the map record, +- effectiveTime,Time,Specifies the inclusive date at which this change becomes effective., +- active,Boolean,Specifies whether the member’s state was active (=1) or inactive (=0) from the nominal release date specified by the effectiveTime field., +- moduleId,SctId,Identifies the member version’s module. Set to a child of 900000000000443000|Module| within the metadata hierarchy.,The only value in the entire set is '5991000124107', which has label 'SNOMED CT to ICD-10-CM rule-based mapping module' (https://www.findacode.com/snomed/5991000124107--snomed-ct-to-icd-10-cm-rule-based-mapping-module.html). +- refSetId,SctId,Set to one of the children of the |Complex map type| concept in the metadata hierarchy.,The only value in the entire set is '6011000124106', which has label 'ICD-10-CM complex map reference set' (https://www.findacode.com/snomed/6011000124106--icd-10-cm-complex-map-reference-set.html). +- referencedComponentId,SctId,The SNOMED CT source concept ID that is the subject of the map record., +- mapGroup,Integer,An integer identifying a grouping of complex map records which will designate one map target at the time of map rule evaluation. 
Source concepts that require two map targets for classification will have two sets of map groups., +- mapPriority,Integer,Within a map group, the mapPriority specifies the order in which complex map records should be evaluated to determine the correct map target., +- mapRule,String,A machine-readable rule, (evaluating to either ‘true’ or ‘false’ at run-time) that indicates whether this map record should be selected within its map group., +- mapAdvice,String,Human-readable advice that may be employed by the software vendor to give an end-user advice on selection of the appropriate target code. This includes a) a summary statement of the map rule logic, b) a statement of any limitations of the map record and c) additional classification guidance for the coding professional., +- mapTarget,String,The target ICD-10 classification code of the map record., +- correlationId,SctId,A child of |Map correlation value| in the metadata hierarchy, identifying the correlation between the SNOMED CT concept and the target code., +- mapCategoryId,SctId,Identifies the SNOMED CT concept in the metadata hierarchy which is the MapCategory for the associated map record. This is a subtype of 447634004 |ICD-10 Map Category value|., + +Mappings: SSSOM::SNOMED_Complex_Map +----------------------------------- +Copy/pasta of state of mappings as of 2022/03/04: + +'subject_id': f'SNOMED:{row["referencedComponentId"]}', +'subject_label': row['referencedComponentName'], + +# 'predicate_id': 'skos:exactMatch', +# - mapCategoryId: can use for mapping predicate? Or is correlationId more suitable? +# or is there a SKOS predicate I can map to in case where predicate is unknown? I think most of these +# mappings are attempts at exact matches, but I can't be sure (at least not without using these fields +# to determine: mapGroup, mapPriority, mapRule, mapAdvice). 
+# mapCategoryId,mapCategoryName: Only these in set: 447637006 "MAP SOURCE CONCEPT IS PROPERLY CLASSIFIED", +# 447638001 "MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH AVAILABLE DATA", +# 447639009 "MAP OF SOURCE CONCEPT IS CONTEXT DEPENDENT" +# 'predicate_modifier': '???', +# Description: Modifier for negating the predicate. See https://github.com/mapping-commons/sssom/issues/40 +# Range: PredicateModifierEnum: (joe: only lists 'Not' as an option) +# Example: Not Negates the predicate, see documentation of predicate_modifier_enum +# - predicate_id <- mapAdvice? +# - predicate_modifier <- mapAdvice? +# mapAdvice: Pipe-delimited qualifiers. Ex: +# "ALWAYS Q71.30 | CONSIDER LATERALITY SPECIFICATION" +# "IF LISSENCEPHALY TYPE 3 FAMILIAL FETAL AKINESIA SEQUENCE SYNDROME CHOOSE Q04.3 | MAP OF SOURCE CONCEPT +# IS CONTEXT DEPENDENT" +# "MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH AVAILABLE DATA" +'predicate_id': f'SNOMED:{row["mapCategoryId"]}', +'predicate_label': row['mapCategoryName'], + +'object_id': f'ICD10CM:{row["mapTarget"]}', +'object_label': row['mapTargetName'], + +# match_type <- mapRule? +# ex: TRUE: when "ALWAYS " is in pipe-delimited list in mapAdvice, this always shows TRUE. Does this +# mean I could use skos:exactMatch in these cases? +# match_type <- correlationId?: This may look redundant, but I want to be explicit. In officially downloaded +# SNOMED mappings, all of them had correlationId of 447561005, which also happens to be 'unspecified'. +# If correlationId is indeed more appropriate for predicate_id, then I don't think there is a representative +# field for 'match_type'. 
+'match_type': MatchTypeEnum('Unspecified') if row['correlationId'] == match_type_snomed_unspecified_id \ + else MatchTypeEnum('Unspecified'), + +'mapping_date': date_parser.parse(str(row['effectiveTime'])).date(), +'other': '|'.join([f'{k}={str(row[k])}' for k in [ + 'id', + 'active', + 'moduleId', + 'refsetId', + 'mapGroup', + 'mapPriority', + 'mapRule', + 'mapAdvice', +]]), + +# More fields (https://mapping-commons.github.io/sssom/Mapping/): +# - subject_category: absent +# - author_id: can this be "SNOMED"? +# - author_label: can this be "SNOMED"? +# - reviewer_id: can this be "SNOMED"? +# - reviewer_label: can this be "SNOMED"? +# - creator_id: can this be "SNOMED"? +# - creator_label: can this be "SNOMED"? +# - license: Is this something that can be determined? +# - subject_source: URL of some official page for SNOMED version used? +# - subject_source_version: Is this knowable? +# - objectCategory <= mapRule? +# mapRule: ex: TRUE: when "ALWAYS " is in pipe-delimited list in mapAdvice, this always shows TRUE. +# Does this mean I could use skos:exactMatch in these cases? +# object_category: +# objectCategory: +# Description: The conceptual category to which the subject belongs to. This can be a string denoting +# the category or a term from a controlled vocabulary. +# Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".) +# - object_source: URL of some official page for ICD10CM version used? +# - object_source_version: would this be "10CM" as in "ICD10CM"? Or something else? Or nothing? +# - mapping_provider: can this be "SNOMED"? +# - mapping_cardinality: Could I determine 1:1 or 1:many or many:1 based on: +# mapGroup, mapPriority, mapRule, mapAdvice? +# - match_term_type: What is this? +# - see_also: Should this be a URL to the SNOMED term? +# - comment: Description: Free text field containing either curator notes or text generated by tool providing +# additional informative information. 
+ + +SNOMED mapping related codes +---------------------------- +match_type_snomed_unspecified_id = 447561005 +https://www.findacode.com/snomed/447561005--snomed-ct-source-code-to-target-map-correlation-not-specified.html + +Additional resources +-------------------- +About SNOMED simple and complex refsets: +https://github.com/HOT-Ecosystem/tccm/blob/master/docs/SNOMED/MapRefsets.md diff --git a/sssom/parsers.py b/sssom/parsers.py index 33ea5206..73f5b34b 100644 --- a/sssom/parsers.py +++ b/sssom/parsers.py @@ -15,13 +15,10 @@ import pandas as pd import validators import yaml +from dateutil import parser as date_parser from deprecation import deprecated from linkml_runtime.loaders.json_loader import JSONLoader from rdflib import Graph, URIRef - -# from .sssom_datamodel import Mapping, MappingSet -from sssom_schema import Mapping, MappingSet - from sssom.constants import ( CONFIDENCE, CURIE_MAP, @@ -43,6 +40,7 @@ SUBJECT_SOURCE, SUBJECT_SOURCE_ID, ) +from sssom_schema import Mapping, MappingSet from .context import ( DEFAULT_LICENSE, @@ -261,6 +259,32 @@ def parse_obographs_json( ) +def parse_snomed_complex_map_tsv( + file_path: str, + prefix_map: Optional[Dict[str, str]] = None, + meta: Optional[Dict[str, str]] = None, + filter_by_confident_mappings: bool = True, +) -> MappingSetDataFrame: + """Parse a SNOMED-to-ICD10CM complex map TSV file and translate it into a MappingSetDataFrame.
+ + :param file_path: The path to the source file + :param prefix_map: An optional prefix map, defaults to None + :param meta: An optional dictionary of metadata elements, defaults to None + :param filter_by_confident_mappings: Will only include mapping rows where the + `mapAdvice` field includes an 'ALWAYS ' pattern, defaults to True + :return: A SSSOM MappingSetDataFrame + """ + raise_for_bad_path(file_path) + df = read_pandas(file_path) + df2 = from_snomed_complex_map_tsv( + df, + prefix_map=prefix_map, + meta=meta, + filter_by_confident_mappings=filter_by_confident_mappings, + ) + return df2 + + def _get_prefix_map_and_metadata( prefix_map: Optional[PrefixMap] = None, meta: Optional[MetadataType] = None ) -> Metadata: @@ -307,7 +331,6 @@ def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet: def _get_mdict_ms_and_bad_attrs( row: pd.Series, ms: MappingSet, bad_attrs: Counter ) -> Tuple[dict, MappingSet, Counter]: - mdict = {} for k, v in row.items(): @@ -666,6 +689,174 @@ def from_obographs( return to_mapping_set_dataframe(mdoc) +def from_snomed_complex_map_tsv( + df: pd.DataFrame, + prefix_map: Optional[PrefixMap] = None, + meta: Optional[MetadataType] = None, + filter_by_confident_mappings: bool = True, +) -> MappingSetDataFrame: + """Convert a snomed_icd10cm_map dataframe to a MappingSetDataFrame. + + :param df: A mappings dataframe + :param prefix_map: A prefix map + :param meta: A metadata dictionary + :param filter_by_confident_mappings: Will only include mapping rows where the `mapAdvice` field includes an 'ALWAYS + ' pattern.
+ :return: MappingSetDataFrame + + # Field descriptions + # - Taken from: doc_Icd10cmMapReleaseNotes_Current-en-US_US1000124_20210901.pdf + FIELD,DATA_TYPE,PURPOSE,Joe's comments + - id,UUID,A 128 bit unsigned integer, uniquely identifying the map record, + - effectiveTime,Time,Specifies the inclusive date at which this change becomes effective., + - active,Boolean,Specifies whether the member’s state was active (=1) or inactive (=0) from the nominal release date + specified by the effectiveTime field., + - moduleId,SctId,Identifies the member version’s module. Set to a child of 900000000000443000|Module| within the + metadata hierarchy.,The only value in the entire set is '5991000124107', which has label 'SNOMED CT to ICD-10-CM + rule-based mapping module' ( + https://www.findacode.com/snomed/5991000124107--snomed-ct-to-icd-10-cm-rule-based-mapping-module.html). + - refSetId,SctId,Set to one of the children of the |Complex map type| concept in the metadata hierarchy.,The only + value in the entire set is '6011000124106', which has label 'ICD-10-CM complex map reference set' ( + https://www.findacode.com/snomed/6011000124106--icd-10-cm-complex-map-reference-set.html). + - referencedComponentId,SctId,The SNOMED CT source concept ID that is the subject of the map record., + - mapGroup,Integer,An integer identifying a grouping of complex map records which will designate one map target at + the time of map rule evaluation.
Source concepts that require two map targets for classification will have two sets + of map groups., + - mapPriority,Integer,Within a map group, the mapPriority specifies the order in which complex map records should be + evaluated to determine the correct map target., + - mapRule,String,A machine-readable rule, (evaluating to either ‘true’ or ‘false’ at run-time) that indicates + whether this map record should be selected within its map group., + - mapAdvice,String,Human-readable advice that may be employed by the software vendor to give an end-user advice on + selection of the appropriate target code. This includes a) a summary statement of the map rule logic, b) a statement + of any limitations of the map record and c) additional classification guidance for the coding professional., + - mapTarget,String,The target ICD-10 classification code of the map record., + - correlationId,SctId,A child of |Map correlation value| in the metadata hierarchy, identifying the correlation + between the SNOMED CT concept and the target code., + - mapCategoryId,SctId,Identifies the SNOMED CT concept in the metadata hierarchy which is the MapCategory for the + associated map record. This is a subtype of 447634004 |ICD-10 Map Category value|., + """ + # Local variables + # https://www.findacode.com/snomed/447561005--snomed-ct-source-code-to-target-map-correlation-not-specified.html + # (SNOMED correlationId 447561005 = "correlation not specified"; the only value seen in official downloads.) + # - Note: joeflack4: I used this info as a reference for this pattern. + # https://www.medicalbillingandcoding.org/icd-10-cm/#:~:text=ICD%2D10%2DCM%20is%20a,decimal%20point%20and%20the%20subcategory. + always_confidence_pattern = r"ALWAYS [A-Z]{1}[0-9]{1,2}\.[0-9A-Z]{1,4}" + always_confidence_antipattern = always_confidence_pattern + r"\?"
+ prefix_map = _ensure_prefix_map(prefix_map) + ms = _init_mapping_set(meta) + + # Filtering + if filter_by_confident_mappings: + df = df[ + ( + df["mapAdvice"].str.contains( + always_confidence_pattern, regex=True, na=False + ) + ) + & ( + ~df["mapAdvice"].str.contains( + always_confidence_antipattern, regex=True, na=False + ) + ) + ] + + # Map mappings + mlist: List[Mapping] = [] + for _, row in df.iterrows(): + mdict = { + "subject_id": f'SNOMED:{row["referencedComponentId"]}', + "subject_label": row["referencedComponentName"], + # 'predicate_id': 'skos:exactMatch', + # - mapCategoryId: can use for mapping predicate? Or is correlationId more suitable? + # or is there a SKOS predicate I can map to in case where predicate is unknown? I think most of these + # mappings are attempts at exact matches, but I can't be sure (at least not without using these fields + # to determine: mapGroup, mapPriority, mapRule, mapAdvice). + # mapCategoryId,mapCategoryName: Only these in set: 447637006 "MAP SOURCE CONCEPT IS PROPERLY CLASSIFIED", + # 447638001 "MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH AVAILABLE DATA", + # 447639009 "MAP OF SOURCE CONCEPT IS CONTEXT DEPENDENT" + # 'predicate_modifier': '???', + # Description: Modifier for negating the predicate. See https://github.com/mapping-commons/sssom/issues/40 + # Range: PredicateModifierEnum: (joe: only lists 'Not' as an option) + # Example: Not Negates the predicate, see documentation of predicate_modifier_enum + # - predicate_id <- mapAdvice? + # - predicate_modifier <- mapAdvice? + # mapAdvice: Pipe-delimited qualifiers.
Ex: + # "ALWAYS Q71.30 | CONSIDER LATERALITY SPECIFICATION" + # "IF LISSENCEPHALY TYPE 3 FAMILIAL FETAL AKINESIA SEQUENCE SYNDROME CHOOSE Q04.3 | MAP OF SOURCE CONCEPT + # IS CONTEXT DEPENDENT" + # "MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH AVAILABLE DATA" + "predicate_id": f'SNOMED:{row["mapCategoryId"]}', + "predicate_label": row["mapCategoryName"], + "object_id": f'ICD10CM:{row["mapTarget"]}', + "object_label": row["mapTargetName"], + # mapping_justification <- mapRule? + # ex: TRUE: when "ALWAYS " is in pipe-delimited list in mapAdvice, this always shows TRUE. Does this + # mean I could use skos:exactMatch in these cases? + # mapping_justification <- correlationId?: This may look redundant, but I want to be explicit. In officially downloaded + # SNOMED mappings, all of them had correlationId of 447561005, which also happens to be 'unspecified'. + # If correlationId is indeed more appropriate for predicate_id, then I don't think there is a representative + # field for 'mapping_justification'. + # TODO: How to properly get mapping_justification? + # I think I need to use sssom_schema.slots.mapping_justification, but not sure how to use.
+ # slots.mapping_justification = Slot(uri=SSSOM.mapping_justification, name="mapping_justification", curie=SSSOM.curie('mapping_justification'), + # model_uri=SSSOM.mapping_justification, domain=None, range=Union[str, EntityReference], + # pattern=re.compile(r'^semapv:(MappingReview|ManualMappingCuration|LogicalReasoning|LexicalMatching|CompositeMatching|UnspecifiedMatching|SemanticSimilarityThresholdMatching|LexicalSimilarityThresholdMatching|MappingChaining)$')) + # correlationId cannot refine the justification (it is "not specified" for every row, see above), and the + # slot pattern quoted above only accepts semapv CURIEs, so the unspecified value is set unconditionally. + "mapping_justification": "semapv:UnspecifiedMatching", + "mapping_date": date_parser.parse(str(row["effectiveTime"])).date(), + "other": "|".join( + [ + f"{k}={str(row[k])}" + for k in [ + "id", + "active", + "moduleId", + "refsetId", + "mapGroup", + "mapPriority", + "mapRule", + "mapAdvice", + ] + ] + ), + # More fields (https://mapping-commons.github.io/sssom/Mapping/): + # - subject_category: absent + # - author_id: can this be "SNOMED"? + # - author_label: can this be "SNOMED"? + # - reviewer_id: can this be "SNOMED"? + # - reviewer_label: can this be "SNOMED"? + # - creator_id: can this be "SNOMED"? + # - creator_label: can this be "SNOMED"? + # - license: Is this something that can be determined? + # - subject_source: URL of some official page for SNOMED version used? + # - subject_source_version: Is this knowable? + # - objectCategory <= mapRule? + # mapRule: ex: TRUE: when "ALWAYS " is in pipe-delimited list in mapAdvice, this always shows TRUE. + # Does this mean I could use skos:exactMatch in these cases? + # object_category: + # objectCategory: + # Description: The conceptual category to which the subject belongs to. This can be a string denoting + # the category or a term from a controlled vocabulary. + # Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".) + # - object_source: URL of some official page for ICD10CM version used?
+ # - object_source_version: would this be "10CM" as in "ICD10CM"? Or something else? Or nothing? + # - mapping_provider/mapping_tool: can this be "SNOMED"? + # - mapping_cardinality: Could I determine 1:1 or 1:many or many:1 based on: + # mapGroup, mapPriority, mapRule, mapAdvice? + # - match_term_type: What is this? + # - see_also: Should this be a URL to the SNOMED term? + # - comment: Description: Free text field containing either curator notes or text generated by tool providing + # additional informative information. + } + mlist.append(_prepare_mapping(Mapping(**mdict))) + + ms.mappings = mlist + _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta) + doc = MappingSetDocument(mapping_set=ms, prefix_map=prefix_map) + return to_mapping_set_dataframe(doc) + + # All from_* take as an input a python object (data frame, json, etc) and return a MappingSetDataFrame # All read_* take as an input a a file handle and return a MappingSetDataFrame (usually wrapping a from_* method) @@ -690,6 +881,8 @@ def get_parsing_function(input_format: Optional[str], filename: str) -> Callable return parse_alignment_xml elif input_format == "obographs-json": return parse_obographs_json + elif input_format == "snomed-complex-map-tsv": + return parse_snomed_complex_map_tsv else: raise Exception(f"Unknown input format: {input_format}") diff --git a/sssom/util.py b/sssom/util.py index 8906703f..56d0bf6a 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -69,12 +69,13 @@ PREFIX_MAP_KEY = "curie_map" SSSOM_READ_FORMATS = [ - "tsv", - "rdf", + "json", "owl", + "rdf", + "tsv", "alignment-api-xml", "obographs-json", - "json", + "snomed-complex-map-tsv", ] SSSOM_EXPORT_FORMATS = ["tsv", "rdf", "owl", "json", "fhir"]