diff --git a/CodeListLibrary_project/clinicalcode/migrations/0134_alter_omop_codes_valid_end_date_omoprelationships.py b/CodeListLibrary_project/clinicalcode/migrations/0134_alter_omop_codes_valid_end_date_omoprelationships.py new file mode 100644 index 000000000..7bd46e95f --- /dev/null +++ b/CodeListLibrary_project/clinicalcode/migrations/0134_alter_omop_codes_valid_end_date_omoprelationships.py @@ -0,0 +1,37 @@ +# Generated by Django 5.1.8 on 2026-02-09 14:32 + +import datetime +import django.contrib.postgres.indexes +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('clinicalcode', '0133_omop_codes'), + ] + + operations = [ + migrations.AlterField( + model_name='omop_codes', + name='valid_end_date', + field=models.DateField(blank=True, default=datetime.date(2099, 12, 31)), + ), + migrations.CreateModel( + name='OMOPRelationships', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False)), + ('relationship', models.CharField(max_length=64)), + ('valid_start_date', models.DateField(auto_now_add=True)), + ('valid_end_date', models.DateField(blank=True, default=datetime.date(2099, 12, 31))), + ('invalid_reason', models.CharField(blank=True, choices=[('D', 'Deprecated'), ('U', 'Upgraded')], default=None, max_length=1, null=True)), + ('code0', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='source_relationships', to='clinicalcode.omop_codes', to_field='code')), + ('code1', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='destination_relationships', to='clinicalcode.omop_codes', to_field='code')), + ], + options={ + 'indexes': [models.Index(fields=['code0'], name='clinicalcod_code0_i_9f4db1_idx'), models.Index(fields=['code1'], name='clinicalcod_code1_i_fbd524_idx'), models.Index(fields=['code0', 'code1'], name='clinicalcod_code0_i_634ab7_idx'), models.Index(fields=['code0', 
'relationship', 'invalid_reason'], name='clinicalcod_code0_i_bea734_idx'), django.contrib.postgres.indexes.GinIndex(fields=['code0'], name='omop_e0_trgm_idx', opclasses=['gin_trgm_ops']), django.contrib.postgres.indexes.GinIndex(fields=['code0', 'relationship', 'invalid_reason'], name='omop_e0t_tgbt_idx', opclasses=['', 'gin_trgm_ops', ''])], + 'unique_together': {('code0', 'code1', 'relationship')}, + }, + ), + ] diff --git a/CodeListLibrary_project/clinicalcode/models/OMOP_CODES.py b/CodeListLibrary_project/clinicalcode/models/OMOP_CODES.py index ac89d262f..847cc545b 100644 --- a/CodeListLibrary_project/clinicalcode/models/OMOP_CODES.py +++ b/CodeListLibrary_project/clinicalcode/models/OMOP_CODES.py @@ -2,6 +2,8 @@ from django.utils.translation import gettext_lazy as _ from django.contrib.postgres.indexes import GinIndex +import datetime + class StandardFlag(models.TextChoices): """ See `standard_concept` column of `CONCEPT` table found in `OHDSI docs`_. @@ -11,6 +13,7 @@ class StandardFlag(models.TextChoices): STANDARD = 'S', _('Standard Concept') CLASSIFICATION = 'C', _('Classification Concept') + class InvalidFlag(models.TextChoices): """ See `invalid_reason` column of `CONCEPT` table found in `OHDSI docs`_. @@ -20,106 +23,181 @@ class InvalidFlag(models.TextChoices): DEPRECATED = 'D', _('Deprecated') UPGRADED = 'U', _('Upgraded') + class OMOP_CODES(models.Model): - """ - Represents a standardised `OMOP`_ code from its Common Data Model, related to :model:`clinicalcode.CodingSystem`. - - Version - ------- - Vocabulary version: `v20250827` - - Reference - --------- - See `OHDSI Single-page docs`_. 
- - Mapping - ---------- - | Attribute | Table->Column | - |:---------------------|:---------------------------------| - | `code` | `CONCEPT->concept_id` | - | `description` | `CONCEPT->concept_name` | - | `is_code` | `N/A` | - | `is_valid` | `N/A` | - | `standard_concept` | `CONCEPT->standard_concept` | - | `coding_name` | `N/A` | - | `coding_system_id` | `N/A` | - | `domain_name` | `CONCEPT->domain_id` | - | `class_name` | `CONCEPT->concept_class_id` | - | `vocabulary_name` | `CONCEPT->vocabulary_id` | - | `vocabulary_code` | `CONCEPT->concept_code` | - | `vocabulary_version` | `VOCABULARY->vocabulary_version` | - | `valid_start_date` | `CONCEPT->valid_start_date` | - | `valid_end_date` | `CONCEPT->valid_end_date` | - | `invalid_reason` | `CONCEPT->invalid_reason` | - | `created` | `N/A` | - | `modified` | `N/A` | - - .. _OMOP: https://www.ohdsi.org/data-standardization/ - .. _OHDSI Single-page docs: https://www.ohdsi.org/web/wiki/doku.php?id=documentation:cdm:single-page - """ - id = models.BigAutoField(auto_created=True, primary_key=True) - code = models.CharField(max_length=64, null=True, blank=True, unique=True, default='') - description = models.CharField(max_length=256, null=True, blank=True, default='') - is_code = models.BooleanField(null=False, default=True) - is_valid = models.BooleanField(null=False, default=True) - standard_concept = models.CharField( - # 'S', 'C' or NULL - null=True, - blank=True, - choices=StandardFlag.choices, - default=None, - max_length=1, - ) - coding_name = models.CharField(max_length=256, null=True, blank=True, default='') - coding_system = models.ForeignKey( - 'clinicalcode.CodingSystem', - on_delete=models.SET_NULL, - null=True, - blank=True, - default=None, - related_name='omop_code' - ) - domain_name = models.CharField(max_length=256, null=True, blank=True, default='') - class_name = models.CharField(max_length=256, null=True, blank=True, default='') - vocabulary_name = models.CharField(max_length=64, null=True, 
blank=True, default='') - vocabulary_code = models.CharField(max_length=64, null=True, blank=True, default='') - vocabulary_version = models.CharField(max_length=256, null=True, blank=True, default='') - valid_start_date = models.DateField(null=True, blank=True) - valid_end_date = models.DateField(null=True, blank=True) - invalid_reason = models.CharField( - # 'D', 'U' or NULL - null=True, - blank=True, - choices=InvalidFlag.choices, - default=None, - max_length=1, - ) - created = models.DateTimeField(auto_now_add=True, editable=True) - modified = models.DateTimeField(auto_now_add=True, editable=True) - - class Meta: - ordering = ('id',) - indexes = [ - models.Index(fields=['id']), - models.Index(fields=['created']), - GinIndex( - name='omop_cd_trgm_idx', - fields=['code'], - opclasses=['gin_trgm_ops'] - ), - GinIndex( - name='omop_desc_trgm_idx', - fields=['description'], - opclasses=['gin_trgm_ops'] - ), - GinIndex( - name='omop_cs_trgm_idx', - fields=['coding_name'], - opclasses=['gin_trgm_ops'] - ), - GinIndex( - name='omop_vscd_trgm_idx', - fields=['vocabulary_code'], - opclasses=['gin_trgm_ops'] - ), - ] + """ + Represents a standardised `OMOP`_ code from its Common Data Model, related to :model:`clinicalcode.CodingSystem`. + + Version + ------- + Vocabulary version: `v20250827` + + Reference + --------- + See `OHDSI Single-page docs`_. 
+ + Mapping + ------- + | Attribute | Table->Column | + |:---------------------|:---------------------------------| + | `code` | `CONCEPT->concept_id` | + | `description` | `CONCEPT->concept_name` | + | `is_code` | `N/A` | + | `is_valid` | `N/A` | + | `standard_concept` | `CONCEPT->standard_concept` | + | `coding_name` | `N/A` | + | `coding_system_id` | `N/A` | + | `domain_name` | `CONCEPT->domain_id` | + | `class_name` | `CONCEPT->concept_class_id` | + | `vocabulary_name` | `CONCEPT->vocabulary_id` | + | `vocabulary_code` | `CONCEPT->concept_code` | + | `vocabulary_version` | `VOCABULARY->vocabulary_version` | + | `valid_start_date` | `CONCEPT->valid_start_date` | + | `valid_end_date` | `CONCEPT->valid_end_date` | + | `invalid_reason` | `CONCEPT->invalid_reason` | + | `created` | `N/A` | + | `modified` | `N/A` | + + .. _OMOP: https://www.ohdsi.org/data-standardization/ + .. _OHDSI Single-page docs: https://www.ohdsi.org/web/wiki/doku.php?id=documentation:cdm:single-page + """ + id = models.BigAutoField(auto_created=True, primary_key=True) + code = models.CharField(max_length=64, null=True, blank=True, unique=True, default='') + description = models.CharField(max_length=256, null=True, blank=True, default='') + is_code = models.BooleanField(null=False, default=True) + is_valid = models.BooleanField(null=False, default=True) + standard_concept = models.CharField( + # 'S', 'C' or NULL + null=True, + blank=True, + choices=StandardFlag.choices, + default=None, + max_length=1, + ) + coding_name = models.CharField(max_length=256, null=True, blank=True, default='') + coding_system = models.ForeignKey( + 'clinicalcode.CodingSystem', + on_delete=models.SET_NULL, + null=True, + blank=True, + default=None, + related_name='omop_code' + ) + domain_name = models.CharField(max_length=256, null=True, blank=True, default='') + class_name = models.CharField(max_length=256, null=True, blank=True, default='') + vocabulary_name = models.CharField(max_length=64, null=True, blank=True, 
default='') + vocabulary_code = models.CharField(max_length=64, null=True, blank=True, default='') + vocabulary_version = models.CharField(max_length=256, null=True, blank=True, default='') + valid_start_date = models.DateField(blank=True, null=True) + valid_end_date = models.DateField(blank=True, default=datetime.date(2099,12,31)) + invalid_reason = models.CharField( + # 'D', 'U' or NULL + null=True, + blank=True, + choices=InvalidFlag.choices, + default=None, + max_length=1, + ) + created = models.DateTimeField(auto_now_add=True, editable=True) + modified = models.DateTimeField(auto_now_add=True, editable=True) + + class Meta: + ordering = ('id',) + indexes = [ + models.Index(fields=['id']), + models.Index(fields=['created']), + GinIndex( + name='omop_cd_trgm_idx', + fields=['code'], + opclasses=['gin_trgm_ops'] + ), + GinIndex( + name='omop_desc_trgm_idx', + fields=['description'], + opclasses=['gin_trgm_ops'] + ), + GinIndex( + name='omop_cs_trgm_idx', + fields=['coding_name'], + opclasses=['gin_trgm_ops'] + ), + GinIndex( + name='omop_vscd_trgm_idx', + fields=['vocabulary_code'], + opclasses=['gin_trgm_ops'] + ), + ] + + +class OMOPRelationships(models.Model): + """ + Represents the edges, or `relationships`_, between OMOP concepts stored in :model:`clinicalcode.OMOP_CODES`. + + Version + ------- + Vocabulary version: `v20250827` + + Reference + --------- + See: + - `Concept Relationship Data Model Conventions`_. + - `CONCEPT_RELATIONSHIP table`_. + + Mapping + ------- + | Attribute | Table->Column | + |:---------------------|:-----------------------------------------| + | `code0_id` | `CONCEPT_RELATIONSHIP->concept_id_1` | + | `code1_id` | `CONCEPT_RELATIONSHIP->concept_id_2` | + | `relationship` | `CONCEPT_RELATIONSHIP->relationship_id` | + | `valid_start_date` | `CONCEPT_RELATIONSHIP->valid_start_date` | + | `valid_end_date` | `CONCEPT_RELATIONSHIP->valid_end_date` | + | `invalid_reason` | `CONCEPT_RELATIONSHIP->invalid_reason` | + + .. 
_relationships: https://www.ohdsi.org/web/wiki/doku.php?id=documentation:cdm:concept_relationship + .. _Concept Relationship Data Model Conventions: https://ohdsi.github.io/CommonDataModel/dataModelConventions.html#Concept_Relationships + .. _CONCEPT_RELATIONSHIP table: https://www.ohdsi.org/web/wiki/doku.php?id=documentation:cdm:concept_relationship + """ + id = models.BigAutoField(auto_created=True, primary_key=True) + code0 = models.ForeignKey( + to='OMOP_CODES', + to_field='code', + related_name=f'source_relationships', + on_delete=models.CASCADE, + ) + code1 = models.ForeignKey( + to='OMOP_CODES', + to_field='code', + related_name=f'destination_relationships', + on_delete=models.CASCADE, + ) + relationship = models.CharField(max_length=64) + valid_start_date = models.DateField(blank=True, auto_now_add=True) + valid_end_date = models.DateField(blank=True, default=datetime.date(2099,12,31)) + invalid_reason = models.CharField( + # 'D', 'U' or NULL + null=True, + blank=True, + choices=InvalidFlag.choices, + default=None, + max_length=1, + ) + + class Meta: + unique_together = ('code0', 'code1', 'relationship',) + indexes = [ + models.Index(fields=['code0']), + models.Index(fields=['code1']), + models.Index(fields=['code0', 'code1']), + models.Index(fields=['code0', 'relationship', 'invalid_reason']), + GinIndex( + name='omop_e0_trgm_idx', + fields=['code0'], + opclasses=['gin_trgm_ops'] + ), + GinIndex( + name='omop_e0t_tgbt_idx', + fields=['code0', 'relationship', 'invalid_reason'], + opclasses=['', 'gin_trgm_ops', ''] + ), + ] diff --git a/CodeListLibrary_project/clinicalcode/models/__init__.py b/CodeListLibrary_project/clinicalcode/models/__init__.py index 2cfad7199..0c69dc743 100644 --- a/CodeListLibrary_project/clinicalcode/models/__init__.py +++ b/CodeListLibrary_project/clinicalcode/models/__init__.py @@ -56,7 +56,7 @@ from .ATCDDD_CODES import ATCDDD_CODES from .ICD10CA_CODES import ICD10CA_CODES from .ICD10CM_CODES import ICD10CM_CODES -from .OMOP_CODES 
import OMOP_CODES +from .OMOP_CODES import OMOP_CODES, OMOPRelationships # need to restore EMIS/Vision when deploy. to prod. #from .EMIS_CODES import EMIS_CODES diff --git a/docs/omop/README.md b/docs/omop/README.md new file mode 100644 index 000000000..7ce6dc17d --- /dev/null +++ b/docs/omop/README.md @@ -0,0 +1,171 @@ +# Investigation: OMOP Transformation + +> [!TIP] +> Content retrievable from [Athena](https://athena.ohdsi.org/vocabulary/list). + +Bundle Composition: + +| ID | CDM | Name | Code (cdm v5) | +|----|-------|--------------------------------------------------------------------------------------------------------|---------------| +| 1 | CDM 5 | Systematic Nomenclature of Medicine - Clinical Terms (IHTSDO) | SNOMED | +| 2 | CDM 5 | International Classification of Diseases, Ninth Revision, Clinical Modification, Volume 1 and 2 (NCHS) | ICD9CM | +| 3 | CDM 5 | International Classification of Diseases, Ninth Revision, Clinical Modification, Volume 3 (NCHS) | ICD9Proc | +| 17 | CDM 5 | NHS UK Read Codes Version 2 (HSCIC) | Read | +| 18 | CDM 5 | Oxford Medical Information System (OCHP) | OXMIS | +| 55 | CDM 5 | OPCS Classification of Interventions and Procedures version 4 (NHS) | OPCS4 | +| 70 | CDM 5 | International Classification of Diseases, Tenth Revision, Clinical Modification (NCHS) | ICD10CM | + +Resultset: + +| Total Phenotypes | Total Mapped | Map Rate | Avg Excl. Code Map Rate | Avg Incl. Code Map Rate | +|-----------------:|-------------:|---------:|------------------------:|------------------------:| +| 7,578 | 6,884 | 90.84% | 77.66% | 75.63% | + +## 1. 
Tables + +> [!TIP] +> See ref @ [OHDSI Docs](https://www.ohdsi.org/web/wiki/doku.php?id=documentation:cdm:single-page) + +> [!CAUTION] +> The following files have undergone transformation from `tsv` to `csv` via PowerShell, _e.g._: +> ```ps +> Import-Csv -Path '.project/data/VOCABULARY.csv' -Delimiter "`t" | Export-Csv -Path '.project/out/VOCABULARY.csv' -Encoding UTF8 -NoTypeInformation +> ``` + +> [!CAUTION] +> Note that `omop.relationships.csv` has modified the header & YYYYMMDD date format of `CONCEPT_RELATIONSHIP.csv` using the following: +> ```ps +> Import-CSV ` +> -Path 'CONCEPT_RELATIONSHIP.csv' ` +> -Header "code0_id","code1_id","relationship","valid_start_date","valid_end_date","invalid_reason" ` +> | select -skip 1 ` +> | Foreach-Object { +> $_.'valid_start_date' = $($_.'valid_start_date' -replace '(?<year>\d{4})(?<month>\d{2})(?<day>\d{2})', '${year}-${month}-${day}') +> $_.'valid_end_date' = $($_.'valid_end_date' -replace '(?<year>\d{4})(?<month>\d{2})(?<day>\d{2})', '${year}-${month}-${day}') +> $_ +> } ` +> | Export-CSV -Path 'omop.relationships.csv' -Encoding UTF8 -NoTypeInformation +> ``` + +### 1.1. Base Tables + +#### `data/CONCEPT.csv` + +| Field | Required | Type | Description | +|--------------------|----------|----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `concept_id` | Yes | `integer` | A unique identifier for each Concept across all domains. | +| `concept_name` | Yes | `varchar(255)` | An unambiguous, meaningful and descriptive name for the Concept. | +| `domain_id` | Yes | `varchar(20)` | A foreign key to the DOMAIN table the Concept belongs to. | +| `vocabulary_id` | Yes | `varchar(20)` | A foreign key to the VOCABULARY table indicating from which source the Concept has been adapted. 
| +| `concept_class_id` | Yes | `varchar(20)` | The attribute or concept class of the Concept. Examples are “Clinical Drug”, “Ingredient”, “Clinical Finding” etc. | +| `standard_concept` | No | `varchar(1)` | This flag determines whether a Concept is a Standard Concept, i.e. is used in the data, a Classification Concept, or a non-standard Source Concept. The allowable values are 'S' (Standard Concept) and 'C' (Classification Concept), otherwise the content is NULL. | +| `concept_code` | Yes | `varchar(50)` | The concept code represents the identifier of the Concept in the source vocabulary, such as SNOMED-CT concept IDs, RxNorm RXCUIs etc. Note that concept codes are not unique across vocabularies. | +| `valid_start_date` | Yes | `date` | The date when the Concept was first recorded. The default value is 1-Jan-1970, meaning, the Concept has no (known) date of inception. | +| `valid_end_date` | Yes | `date` | The date when the Concept became invalid because it was deleted or superseded (updated) by a new concept. The default value is 31-Dec-2099, meaning, the Concept is valid until it becomes deprecated. | +| `invalid_reason` | No | `varchar(1)` | Reason the Concept was invalidated. Possible values are D (deleted), U (replaced with an update) or NULL when valid_end_date has the default value. | + +#### `data/DOMAIN.csv` + +| Field | Required | Type | Description | +|---------------------|----------|----------------|-----------------------------------------------------------------------------------------------------------------------------| +| `domain_id` | Yes | `varchar(20)` | A unique key for each domain. | +| `domain_name` | Yes | `varchar(255)` | The name describing the Domain, e.g. “Condition”, “Procedure”, “Measurement” etc. | +| `domain_concept_id` | Yes | `integer` | A foreign key that refers to an identifier in the CONCEPT table for the unique Domain Concept the Domain record belongs to. 
| + +#### `data/VOCABULARY.csv` + +| Field | Required | Type | Description | +|-------------------------|----------|----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `vocabulary_id` | Yes | `varchar(20)` | A unique identifier for each Vocabulary, such as ICD9CM, SNOMED, Visit. | +| `vocabulary_name` | Yes | `varchar(255)` | The name describing the vocabulary, for example “International Classification of Diseases, Ninth Revision, Clinical Modification, Volume 1 and 2 (NCHS)” etc. | +| `vocabulary_reference` | Yes | `varchar(255)` | External reference to documentation or available download of the about the vocabulary. | +| `vocabulary_version` | Yes | `varchar(255)` | Version of the Vocabulary as indicated in the source. | +| `vocabulary_concept_id` | Yes | `integer` | A foreign key that refers to a standard concept identifier in the CONCEPT table for the Vocabulary the VOCABULARY record belongs to. | + +#### `data/RELATIONSHIP.csv` + +| Field | Required | Type | Description | +|---------------------------|----------|----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `relationship_id` | Yes | `varchar(20)` | The type of relationship captured by the relationship record. | +| `relationship_name` | Yes | `varchar(255)` | The text that describes the relationship type. | +| `is_hierarchical` | Yes | `varchar(1)` | Defines whether a relationship defines concepts into classes or hierarchies. Values are 1 for hierarchical relationship or 0 if not. | +| `defines_ancestry` | Yes | `varchar(1)` | Defines whether a hierarchical relationship contributes to the concept_ancestor table. These are subsets of the hierarchical relationships. Valid values are 1 or 0. 
| +| `reverse_relationship_id` | Yes | `varchar(20)` | The identifier for the relationship used to define the reverse relationship between two concepts. | +| `relationship_concept_id` | Yes | `integer` | A foreign key that refers to an identifier in the CONCEPT table for the unique relationship concept. | + +### 1.2. Concept Tables + +#### `data/CONCEPT_CLASS.csv` + +| Field | Required | Type | Description | +|----------------------------|----------|----------------|---------------------------------------------------------------------------------------------------------------------| +| `concept_class_id` | Yes | `varchar(20)` | A unique key for each class. | +| `concept_class_name` | Yes | `varchar(255)` | The name describing the Concept Class, e.g. “Clinical Finding”, “Ingredient”, etc. | +| `concept_class_concept_id` | Yes | `integer` | A foreign key that refers to an identifier in the CONCEPT table for the unique Concept Class the record belongs to. | + +#### `data/CONCEPT_SYNONYM.csv` + +| Field | Required | Type | Description | +|------------------------|----------|-----------------|-------------------------------------------------------| +| `concept_id` | Yes | `integer` | A foreign key to the Concept in the CONCEPT table. | +| `concept_synonym_name` | Yes | `varchar(1000)` | The alternative name for the Concept. | +| `language_concept_id` | Yes | `integer` | A foreign key to a Concept representing the language. | + +#### `data/CONCEPT_RELATIONSHIP.csv` + +| Field | Required | Type | Description | +|--------------------|----------|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `concept_id_1` | Yes | `integer` | A foreign key to a Concept in the CONCEPT table associated with the relationship. Relationships are directional, and this field represents the source concept designation. 
| +| `concept_id_2` | Yes | `integer` | A foreign key to a Concept in the CONCEPT table associated with the relationship. Relationships are directional, and this field represents the destination concept designation. | +| `relationship_id` | Yes | `varchar(20)` | A unique identifier to the type or nature of the Relationship as defined in the RELATIONSHIP table. | +| `valid_start_date` | Yes | `date` | The date when the instance of the Concept Relationship is first recorded. | +| `valid_end_date` | Yes | `date` | The date when the Concept Relationship became invalid because it was deleted or superseded (updated) by a new relationship. Default value is 31-Dec-2099. | +| `invalid_reason` | No | `varchar(1)` | Reason the relationship was invalidated. Possible values are 'D' (deleted), 'U' (replaced with an update) or NULL when valid_end_date has the default value. | + +#### `data/CONCEPT_ANCESTOR.csv` + +| Field | Required | Type | Description | +|----------------------------|----------|-----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ancestor_concept_id` | Yes | `integer` | A foreign key to the concept in the concept table for the higher-level concept that forms the ancestor in the relationship. | +| `descendant_concept_id` | Yes | `integer` | A foreign key to the concept in the concept table for the lower-level concept that forms the descendant in the relationship. | +| `min_levels_of_separation` | Yes | `integer` | The minimum separation in number of levels of hierarchy between ancestor and descendant concepts. This is an attribute that is used to simplify hierarchic analysis. | +| `max_levels_of_separation` | Yes | `integer` | The maximum separation in number of levels of hierarchy between ancestor and descendant concepts. This is an attribute that is used to simplify hierarchic analysis. | + +## 2. 
Source to Concept Map + +> [!TIP] +> See ref @ [`source_to_concept_map`](https://www.ohdsi.org/web/wiki/doku.php?id=documentation:cdm:source_to_concept_map) + +### 2.1. Data Model + +| Field | Required | Type | Description | +|---------------------------|----------|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `source_code` | Yes | `varchar(50)` | The source code being translated into a Standard Concept. | +| `source_concept_id` | Yes | `integer` | A foreign key to the Source Concept that is being translated into a Standard Concept. | +| `source_vocabulary_id` | No | `varchar(20)` | A foreign key to the VOCABULARY table defining the vocabulary of the source code that is being translated to a Standard Concept. | +| `source_code_description` | Yes | `varchar(255)` | An optional description for the source code. This is included as a convenience to compare the description of the source code to the name of the concept. | +| `target_concept_id` | Yes | `integer` | A foreign key to the target Concept to which the source code is being mapped. | +| `target_vocabulary_id` | Yes | `varchar(20)` | A foreign key to the VOCABULARY table defining the vocabulary of the target Concept. | +| `valid_start_date` | Yes | `date` | The date when the mapping instance was first recorded. | +| `valid_end_date` | Yes | `date` | The date when the mapping instance became invalid because it was deleted or superseded (updated) by a new relationship. Default value is 31-Dec-2099. | +| `invalid_reason` | No | `varchar(1)` | Reason the mapping instance was invalidated. Possible values are D (deleted), U (replaced with an update) or NULL when valid_end_date has the default value. | + +### 2.2. Alternative Mapping + +#### 2.2.1. ICD + +> [!TIP] +> 1. Hover `Info` tab +> 2. 
Click `ICD-10 / ICD-11 mapping tables` to download + +Download: [ICD-10 / ICD-11 mapping tables](https://icd.who.int/browse/2025-01/mms/en) + +#### 2.2.2. BNF via dm+d + +Download: [BNF to dm+d](https://www.bennett.ox.ac.uk/blog/2023/11/bnf-to-dictionary-of-medicines-and-devices-dm-d-map-now-available/) + +#### 2.2.3. Read Code V3 via SNOMED + +Download: +- [ReadV2<->V3 crossmap](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/9/items/255/releases) +- [ReadV3<->SNOMED crossmap](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/38/items/270/) diff --git a/docs/omop/py/build_codelist.py b/docs/omop/py/build_codelist.py new file mode 100644 index 000000000..ca641aa6e --- /dev/null +++ b/docs/omop/py/build_codelist.py @@ -0,0 +1,101 @@ +import polars as pl + +# NOTE: +# [!] Use the vocabulary found within `.resources/bundles/Vocab Bundle -- vocabulary_download_v5_{ba86ab8a-c6a5-4bc8-9961-495466f10c05}_1768387228226.zip` + +# NOTE: +# [!] Vocab version: v20250827 +# [!] 
Athena version: 1.15.5.56.250901.1005 + +if __name__ == '__main__': + schema = pl.Schema({ + 'concept_id': pl.String(), + 'concept_name': pl.String(), + 'domain_id': pl.String(), + 'vocabulary_id': pl.String(), + 'concept_class_id': pl.String(), + 'standard_concept': pl.String(), + 'concept_code': pl.String(), + 'valid_start_date': pl.String(), + 'valid_end_date': pl.String(), + 'invalid_reason': pl.String(), + }) + with pl.SQLContext( + concepts=pl.scan_csv( + source='./packages/datasources/CONCEPT.csv', + schema=schema, + has_header=True, + ), + vocabulary=pl.scan_csv( + source='./packages/datasources/VOCABULARY.csv', + has_header=True, + ), + eager=True, + ) as ctx: + query = ''' + select + concept_id as code, + concept_name as description, + (vocabulary_id != 'CDM') as is_code, + (invalid_reason is null or invalid_reason = '') as is_valid, + case + when standard_concept is not null and standard_concept != '' then standard_concept + else null + end as standard_concept, + case + when vocabulary_id = 'Read' then 'Read codes v2' + when vocabulary_id = 'dm+d' then 'dm+d codes' + when vocabulary_id = 'MeSH' then 'MeSH codes' + when vocabulary_id = 'OPCS4' then 'OPCS4 codes' + when vocabulary_id = 'OXMIS' then 'OXMIS codes' + when vocabulary_id = 'SNOMED' then 'SNOMED codes' + when vocabulary_id = 'UK Biobank' then 'UKBioBank codes' + when vocabulary_id = 'ICD9CM' then 'ICD9 codes' + when vocabulary_id = 'ICD9Proc' then 'ICD9 codes' + when vocabulary_id = 'ICD10' then 'ICD-10-CM codes' + when vocabulary_id = 'ICD10CM' then 'ICD-10-CM codes' + when vocabulary_id = 'ICD10PCS' then 'ICD-10-CM codes' + else vocabulary_id + end as coding_name, + case + when vocabulary_id = 'Read' then 5 + when vocabulary_id = 'dm+d' then 23 + when vocabulary_id = 'MeSH' then 26 + when vocabulary_id = 'OPCS4' then 7 + when vocabulary_id = 'OXMIS' then 15 + when vocabulary_id = 'SNOMED' then 9 + when vocabulary_id = 'UK Biobank' then 12 + when vocabulary_id = 'ICD9CM' then 17 + when 
vocabulary_id = 'ICD9Proc' then 17 + when vocabulary_id = 'ICD10' then 25 + when vocabulary_id = 'ICD10CM' then 25 + when vocabulary_id = 'ICD10PCS' then 25 + else null + end as coding_system_id, + domain_id as domain_name, + concept_class_id as class_name, + vocabulary_id as vocabulary_name, + concept_code as vocabulary_code, + vocabulary.vocabulary_version as vocabulary_version, + case + when valid_start_date is not null then date(valid_start_date, '%Y%m%d') + else null + end as valid_start_date, + case + when valid_end_date is not null and valid_end_date != '20991231' then date(valid_end_date, '%Y%m%d') + else null + end as valid_end_date, + case + when invalid_reason is not null and invalid_reason != '' then invalid_reason + else null + end as invalid_reason + from concepts + join vocabulary + using (vocabulary_id) + ''' + res = ctx.execute(query) + res.write_csv(file='./omop.vocabulary.csv') + + with pl.Config(tbl_cols=-1): + print(res) + print('Row Count:', res.select(pl.len())) diff --git a/docs/omop/py/examine_mapping.py b/docs/omop/py/examine_mapping.py new file mode 100644 index 000000000..5dc664831 --- /dev/null +++ b/docs/omop/py/examine_mapping.py @@ -0,0 +1,305 @@ +from glob import glob +from enum import StrEnum + +import os +import re +import polars as pl + +# NOTE: +# [!] Use the vocabulary found within `.resources/bundles/Concept Bundle -- vocabulary_download_v5_{2c1bc99a-c11f-4bc7-a096-8fefe0ce251d}_1768389516543.zip` + +# NOTE: +# [!] Vocab version: v20250827 +# [!] 
Athena version: 1.15.5.56.250901.1005 + +class Paths(StrEnum): + Data = './packages/datasources' + Mapping = './.project/mappings' + Phenotypes = './.project/phenotypes' + +class Config(object): + cs_names = pl.Enum([ + 'ICD9 codes', + 'ICD10 codes', + 'OXMIS codes', + 'OPCS4 codes', + 'Read codes v2', + 'SNOMED CT codes', + ]) + + cs_groups = { + 'ICD9 codes': ['ICD9CM', 'ICD9Proc'], + 'ICD10 codes': ['ICD10', 'ICD10CM', 'ICD10GM', 'ICD10PCS'], + 'OXMIS codes': ['OXMIS', 'Read'], + 'OPCS4 codes': ['OPCS4'], + 'Read codes v2': ['Read'], + 'SNOMED CT codes': ['SNOMED'], + } + + @classmethod + def resolve_pheno(cls, query, as_list=False, default=None): + item = re.match(r'^PH(\d+)/(\d+)$', query) + if item is not None: + fname = 'PH%s_ver_%s.csv' % (item.group(1), item.group(2)) + return [fname] if as_list else fname + return default + + @classmethod + def build_path(cls, root, *args): + base = None + if isinstance(root, Paths): + root = root.value + + match root: + case 'data': + base = Paths.Data.value + case 'mapping': + base = Paths.Mapping.value + case 'phenotypes': + if len(args) == 2: + chn, trg, *_ = args + trg = Config.resolve_pheno(trg, as_list=True) + if isinstance(trg, list): + base = os.path.join(Paths.Phenotypes.value, chn) + args = trg + + if base is None: + base = root + case 'phenotypes/multi': + base = os.path.join(Paths.Phenotypes.value, 'multi') + args = Config.resolve_pheno(args[0], as_list=True, default=args) + case 'phenotypes/single': + base = os.path.join(Paths.Phenotypes.value, 'single') + args = Config.resolve_pheno(args[0], as_list=True, default=args) + case _: + base = root + + return os.path.normpath(os.path.join(base, *args)) + +def build_mappings(target='single'): + ph_schema = pl.Schema({ + 'code': pl.String(), + 'description': pl.String(), + 'coding_system': Config.cs_names, + 'concept_id': pl.String(), + 'concept_version_id': pl.Int32(), + 'concept_name': pl.String(), + 'phenotype_id': pl.String(), + 'phenotype_version_id': 
pl.Int32(), + 'phenotype_name': pl.String(), + }) + + concepts = pl.scan_csv( + source=Config.build_path('data', 'CONCEPT.csv'), + has_header=True, + ) + + relationships = pl.scan_csv( + source=Config.build_path('data', 'CONCEPT_RELATIONSHIP.csv'), + has_header=True, + ) + + for f in glob(Config.build_path(f'phenotypes/{target}', '*.csv')): + pheno = pl.scan_csv( + source=f, + schema=ph_schema, + has_header=True, + ) \ + .collect() \ + .with_columns( + pl.col('coding_system') \ + .replace_strict(Config.cs_groups) \ + .alias('system_mapping') + ) + + phenoid = pheno.select( + pl.first('phenotype_id', 'phenotype_version_id') + ) \ + .map_rows(lambda x: '%s/%d' % x) \ + .to_series()[0] + + systems = pl.Series( + pheno \ + .select( + pl.col('system_mapping') \ + .flatten() \ + .unique() + ) + ) + systems = ','.join(['\'%s\'' % (x,) for x in systems.to_list()]) + + with pl.SQLContext( + concepts=concepts, + phenotype=pheno, + relationships=relationships, + eager=True, + ) as ctx: + """ + New w/ origin + coalesce + """ + query = ''' + with + slim_concepts as ( + select * + from concepts + where vocabulary_id in (%(vocabs)s) + ), + mapped as ( + select + slim_concepts.concept_id, + slim_concepts.concept_code, + slim_concepts.concept_name, + slim_concepts.vocabulary_id, + slim_concepts.invalid_reason, + phenotype.code as origin_code, + phenotype.coding_system as origin_system + from phenotype + join slim_concepts + on phenotype.code = slim_concepts.concept_code + where array_contains(phenotype.system_mapping, slim_concepts.vocabulary_id) + ), + standard as ( + select + cc.concept_id, + cc.concept_code, + cc.concept_name, + cc.vocabulary_id, + cc.invalid_reason, + mapped.origin_code, + mapped.origin_system, + mapped.concept_id as replacement_id + from mapped + join relationships + on mapped.concept_id = relationships.concept_id_1 + join concepts as cc + on relationships.concept_id_2 = cc.concept_id + where cc.vocabulary_id = 'SNOMED' + and cc.invalid_reason = '' + and 
relationships.relationship_id = 'Maps to' + and relationships.invalid_reason = '' + ), + upgradable as ( + select + standard.concept_id, + standard.concept_code, + standard.concept_name, + standard.vocabulary_id, + standard.invalid_reason, + standard.origin_code, + standard.origin_system + from standard + union all + select mapped.* + from mapped + left join standard + on mapped.concept_id = standard.replacement_id + and mapped.origin_code = standard.origin_code + where standard.concept_id is null + ), + replacements as ( + select + conc.concept_id, + conc.concept_code, + conc.concept_name, + conc.vocabulary_id, + conc.invalid_reason, + upgradable.origin_code, + upgradable.origin_system, + upgradable.concept_id as replacement_id + from upgradable + join relationships + on upgradable.concept_id = relationships.concept_id_1 + join concepts as conc + on relationships.concept_id_2 = conc.concept_id + where regexp_like(upgradable.invalid_reason, '^(U|D)$') + and regexp_like(relationships.relationship_id, '^.*(replaced by|was_a from)') + and conc.invalid_reason = '' + and relationships.invalid_reason = '' + ), + upgraded as ( + select + replacements.concept_id, + replacements.concept_code, + replacements.concept_name, + replacements.vocabulary_id, + replacements.invalid_reason, + replacements.origin_code, + replacements.origin_system + from replacements + union all + select upgradable.* + from upgradable + left join replacements + on upgradable.concept_id = replacements.replacement_id + and upgradable.origin_code = replacements.origin_code + where replacements.concept_id is null + ) + select * + from upgraded; + ''' % { + 'vocabs': systems, + } + + codelist = ctx.execute(query) + ctx.register('codelist', codelist) + + codelist.write_csv( + file=Config.build_path(Paths.Phenotypes, f'out-{target}', 'codelist-%s' % (Config.resolve_pheno(phenoid))), + quote_style='necessary' + ) + + query = ''' + select distinct on (phenotype.code) + phenotype.phenotype_id, + 
phenotype.phenotype_version_id, + phenotype.code, + phenotype.coding_system, + phenotype.description + from phenotype + left join codelist + on phenotype.code = codelist.origin_code + and phenotype.coding_system = codelist.origin_system + where codelist.concept_id is null + ''' + + ctx \ + .execute(query) \ + .write_csv( + file=Config.build_path(Paths.Phenotypes, f'out-{target}', 'missing-%s' % (Config.resolve_pheno(phenoid))), + quote_style='necessary' + ) + + query = ''' + select distinct on (codelist.concept_id) + codelist.concept_id, + codelist.concept_code, + codelist.concept_name, + codelist.vocabulary_id, + codelist.invalid_reason, + ('{' || array_to_string(t.origin_codes, ', ') || '}') as origin_codes, + ('{' || array_to_string(t.origin_systems, ', ') || '}') as origin_systems + from codelist + join ( + select f.concept_id, + array_agg(distinct ('"' || f.origin_code || '"')) as origin_codes, + array_agg(distinct ('"' || f.origin_system || '"')) as origin_systems + from codelist as f + group by f.concept_id + ) t + on t.concept_id = codelist.concept_id + window w + as (partition by concept_id) + qualify row_number() + over w = 1; + ''' + + ctx \ + .execute(query) \ + .write_csv( + file=Config.build_path('phenotypes', f'out-{target}', phenoid), + quote_style='necessary' + ) + +if __name__ == '__main__': + build_mappings(target='single') + build_mappings(target='multi') diff --git a/docs/omop/sql/build.codelist.sql b/docs/omop/sql/build.codelist.sql new file mode 100644 index 000000000..fbac5c679 --- /dev/null +++ b/docs/omop/sql/build.codelist.sql @@ -0,0 +1,209 @@ +do $tx$ +begin + --[!] Drop table if exists + if exists( + select 1 + from information_schema.tables + where table_schema = 'public' + and table_name in ('entity_codelists') + ) then + drop table if exists public.entity_codelists cascade; + end if; + + --[!] 
--[!] Create temp tables
  raise notice '[Codelist::] Table creation';

  -- One row per (phenotype version, concept version, component, code) before
  -- inclusion/exclusion rules are applied
  create temp table components (
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    phenotype_name varchar(255) not null,
    concept_id bigint not null,
    concept_version_id bigint not null,
    concept_history_date timestamptz not null,
    concept_name varchar(255) not null,
    component_id bigint not null,
    component_history_id bigint not null,
    logical_type integer not null,
    codelist_id bigint not null,
    codelist_history_id bigint not null,
    coding_system_id bigint not null,
    coding_system_name varchar(255) not null,
    code_id bigint not null,
    code varchar(255) not null,
    description text not null default ''
  );

  --[!] Create output table
  create unlogged table public.entity_codelists (
    id bigint not null,
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    phenotype_name varchar(255) not null,
    concept_id bigint not null,
    concept_version_id bigint not null,
    concept_name varchar(255) not null,
    coding_system_id bigint not null,
    coding_system_name varchar(255) not null,
    code varchar(255) not null,
    description text not null default ''
  );

  raise notice '[Codelist::] Table creation';

  --[!] Build component list
  raise notice '[Codelist::] Component creation';

  with entities as (
    -- Unnest the concept references stored in each entity version's template data
    select entity.id as id,
           entity.history_id as version_id,
           entity.name,
           cast(concepts->>'concept_id' as integer) as concept_id,
           cast(concepts->>'concept_version_id' as integer) as concept_version_id
      from public.clinicalcode_historicalgenericentity as entity,
           json_array_elements(entity.template_data::json->'concept_information') as concepts
     where (entity.is_deleted is null or entity.is_deleted = false)
       and entity.template_data::jsonb ? 'concept_information'
       and json_typeof(entity.template_data::json->'concept_information') = 'array'
       and json_array_length(entity.template_data::json->'concept_information') > 0
  )
  insert into components (
    phenotype_id,
    phenotype_version_id,
    phenotype_name,
    concept_id,
    concept_version_id,
    concept_history_date,
    concept_name,
    component_id,
    component_history_id,
    logical_type,
    codelist_id,
    codelist_history_id,
    coding_system_id,
    coding_system_name,
    code_id,
    code,
    description
  )
  select entity.id as phenotype_id,
         entity.version_id as phenotype_version_id,
         entity.name as phenotype_name,
         concept.id as concept_id,
         max(concept.history_id) as concept_version_id,
         concept.history_date as concept_history_date,
         concept.name as concept_name,
         component.id as component_id,
         max(component.history_id) as component_history_id,
         component.logical_type as logical_type,
         codelist.id as codelist_id,
         max(codelist.history_id) as codelist_history_id,
         concept.coding_system_id as coding_system_id,
         coding.name as coding_system_name,
         codes.id as code_id,
         codes.code,
         codes.description
    from entities as entity
    join public.clinicalcode_historicalconcept as concept
      on concept.id = entity.concept_id
     and concept.history_id = entity.concept_version_id
    join public.clinicalcode_codingsystem as coding
      on coding.id = concept.coding_system_id
    join public.clinicalcode_historicalcomponent as component
      on component.concept_id = concept.id
     and component.history_date <= concept.history_date
     and component.history_type <> '-'
    -- Anti-join: drop components already deleted at the concept version's date
    left join public.clinicalcode_historicalcomponent as deleted_component
      on deleted_component.concept_id = concept.id
     and deleted_component.id = component.id
     and deleted_component.history_date <= concept.history_date
     and deleted_component.history_type = '-'
    join public.clinicalcode_historicalcodelist as codelist
      on codelist.component_id = component.id
     and codelist.history_date <= concept.history_date
     and codelist.history_type <> '-'
    join public.clinicalcode_historicalcode as codes
      on codes.code_list_id = codelist.id
     and codes.history_date <= concept.history_date
     and codes.history_type <> '-'
   where deleted_component.id is null
   group by entity.id,
            entity.version_id,
            entity.name,
            concept.id,
            concept.history_id,
            concept.history_date,
            concept.name,
            component.id,
            component.logical_type,
            codelist.id,
            concept.coding_system_id,
            coding.name,
            codes.id,
            codes.code,
            codes.description;

  raise notice '[Codelist::] Component creation';

  --[!] Build codelist
  raise notice '[Codelist::] Codelist creation';

  with codesets as (
    -- Included codes (logical_type = 1) minus codes excluded (logical_type = 2)
    -- within the same phenotype version and concept version; `rn` de-duplicates
    -- a code that appears in several components
    select included_codes.*,
           row_number() over (
             partition by included_codes.phenotype_id,
                          included_codes.phenotype_version_id,
                          included_codes.concept_id,
                          included_codes.concept_version_id,
                          included_codes.code
                 order by included_codes.code_id desc
           ) as rn
      from components as included_codes
      left join components as excluded_codes
        on excluded_codes.phenotype_id = included_codes.phenotype_id
       and excluded_codes.phenotype_version_id = included_codes.phenotype_version_id
       -- concept_id must be matched against concept_id (not concept_version_id),
       -- otherwise the exclusion never lines up with its own concept
       and excluded_codes.concept_id = included_codes.concept_id
       and excluded_codes.concept_version_id = included_codes.concept_version_id
       and excluded_codes.code = included_codes.code
       and excluded_codes.logical_type = 2
     where included_codes.logical_type = 1
       and excluded_codes.code is null
  )
  insert into public.entity_codelists (
    id,
    phenotype_id,
    phenotype_version_id,
    phenotype_name,
    concept_id,
    concept_version_id,
    concept_name,
    coding_system_id,
    coding_system_name,
    code,
    description
  )
  select row_number() over (
           order by regexp_replace(phenotype_id::text, '[a-zA-Z]+', '')::int asc,
                    phenotype_version_id asc,
                    concept_id asc,
                    concept_version_id asc,
                    code desc
         ) as id,
         phenotype_id,
         phenotype_version_id,
         phenotype_name,
         concept_id,
         concept_version_id,
         concept_name,
         coding_system_id,
         coding_system_name,
         code,
         description
    from codesets
   where rn = 1;

  raise notice '[Codelist::] Codelist creation';

  --[!] Clean temp tables
  drop table if exists components;
end;
$tx$ language plpgsql;
diff --git a/docs/omop/sql/build.relationships.sql b/docs/omop/sql/build.relationships.sql
new file mode 100644
index 000000000..ecd3fb856
--- /dev/null
+++ b/docs/omop/sql/build.relationships.sql
@@ -0,0 +1,16 @@
-- Skip WAL while bulk loading; switched back to logged once the copy is done
alter table public.clinicalcode_omoprelationships
  set unlogged;

-- NOTE: psql meta-commands end at the newline, so `\copy` has to stay on ONE line
\copy public.clinicalcode_omoprelationships(code0_id, code1_id, relationship, valid_start_date, valid_end_date, invalid_reason) from '../path/to/OMOP/packages/DATASO~1/CONCEP~1.CSV' with (FORMAT csv, DELIMITER ',', HEADER, ENCODING 'UTF8', QUOTE '"', ESCAPE '"')

alter table public.clinicalcode_omoprelationships
  set logged;
diff --git a/docs/omop/sql/map.codelist.sql b/docs/omop/sql/map.codelist.sql
new file mode 100644
index 000000000..1b1c1842b
--- /dev/null
+++ b/docs/omop/sql/map.codelist.sql
@@ -0,0 +1,644 @@
do $tx$
declare
  _cursor constant refcursor := '_cursor';
begin
  --[!] Create coding system map lookup
  drop table if exists coding_lookup cascade;

  -- in_coding:  coding system ids accepted from the phenotype codelists
  -- out_coding: coding system ids of the OMOP codes they may be matched against
  create temp table coding_lookup as
    select *
      from (
        values
          (     'dm+d'::text,       '{23}'::int[],   '{23}'::int[]),
          (     'MeSH'::text,       '{26}'::int[],   '{26}'::int[]),
          (     'ICD9'::text,       '{17}'::int[],   '{17}'::int[]),
          (    'ICD10'::text,  '{4,24,25}'::int[],   '{25}'::int[]),
          (    'OPCS4'::text,        '{7}'::int[],    '{7}'::int[]),
          (    'OXMIS'::text,       '{15}'::int[], '{5,15}'::int[]),
          (   'READV2'::text,        '{5}'::int[],    '{5}'::int[]),
          (   'SNOMED'::text,        '{9}'::int[],    '{9}'::int[]),
          ('UKBioBank'::text,       '{12}'::int[],   '{12}'::int[])
      ) as t(name, in_coding, out_coding);

  --[!] 
--[!] Create intermediate tables
  drop table if exists omop_init_match cascade;
  drop table if exists omop_best_match cascade;

  -- Direct code matches between phenotype codelists and OMOP codes
  create temp table omop_init_match(
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    concept_id int not null,
    concept_version_id bigint not null,
    omop_id varchar(64) not null,
    omop_name varchar(256) not null,
    omop_vocabulary varchar(64) not null,
    omop_code varchar(64) not null,
    invalid_reason varchar(1) default null,
    origin_code varchar(64) not null,
    origin_system_id int not null,
    origin_system_name varchar(64) not null
  );

  -- Matches after standardisation and replacement of invalid codes
  create temp table omop_best_match(
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    concept_id int not null,
    concept_version_id bigint not null,
    omop_id varchar(64) not null,
    omop_name varchar(256) not null,
    omop_vocabulary varchar(64) not null,
    omop_code varchar(64) not null,
    invalid_reason varchar(1) default null,
    origin_code varchar(64) not null,
    origin_system_id int not null,
    origin_system_name varchar(64) not null
  );

  --[!] Create stat tables
  drop table if exists omop_phenotype_info cascade;
  drop table if exists omop_concept_info cascade;

  create unlogged table omop_phenotype_info(
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    coding_system_ids int[] not null,
    coding_system_names text[] not null,
    coding_system_count int not null,
    mapped_system_count int not null,
    mapped_system_rate real not null, -- i.e. % of concepts that use coding systems with OMOP crossmaps
    total_size int not null,          -- i.e. total size of codelist including those from coding systems without OMOP crossmaps
    input_size int not null,          -- i.e. size of the codelist from all concepts using coding systems with OMOP crossmaps
    output_size int not null,
    mapped_count int not null,
    mapped_rate real not null,        -- i.e. % of mapped excluding the codes from coding systems that do not have OMOP crossmaps
    total_rate real not null          -- i.e. % of mapped if we include all codes including those that don't have OMOP-mappable coding system(s)
  );

  create unlogged table omop_concept_info(
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    concept_id int not null,
    concept_version_id bigint not null,
    coding_system_id int not null,
    coding_system_name varchar(64) not null,
    input_size int not null,
    output_size int not null,
    mapped_count int not null,
    mapped_rate real not null,
    unmatched_codes text[] not null
  );

  --[!] Create output tables
  drop table if exists omop_concept_result cascade;
  drop table if exists omop_mapping_result cascade;

  create unlogged table omop_concept_result(
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    concept_id int not null,
    concept_version_id bigint not null,
    omop_id varchar(64) not null,
    omop_name varchar(256) not null,
    omop_vocabulary varchar(64) not null,
    omop_code varchar(64) not null,
    invalid_reason varchar(1) default null,
    origin_codes text[] not null,
    origin_system_ids int[] not null,
    origin_system_names text[] not null
  );

  create unlogged table omop_mapping_result(
    phenotype_id varchar(50) not null,
    phenotype_version_id bigint not null,
    omop_id varchar(64) not null,
    omop_name varchar(256) not null,
    omop_vocabulary varchar(64) not null,
    omop_code varchar(64) not null,
    invalid_reason varchar(1) default null,
    origin_codes text[] not null,
    origin_system_ids int[] not null,
    origin_system_names text[] not null
  );

  --[!] Build initial mapping
  insert
    into omop_init_match (
      phenotype_id, phenotype_version_id, concept_id, concept_version_id,
      omop_id, omop_name, omop_vocabulary, omop_code, invalid_reason,
      origin_code, origin_system_id, origin_system_name
    )
  select
      phenotype.phenotype_id,
      phenotype.phenotype_version_id,
      phenotype.concept_id,
      phenotype.concept_version_id,
      concept.code as omop_id,
      concept.description as omop_name,
      concept.vocabulary_name as omop_vocabulary,
      concept.vocabulary_code as omop_code,
      concept.invalid_reason,
      phenotype.code as origin_code,
      phenotype.coding_system_id as origin_system_id,
      phenotype.coding_system_name as origin_system_name
    from public.entity_codelists as phenotype
    join coding_lookup as lookup
      on phenotype.coding_system_id = any(lookup.in_coding)
    join public.clinicalcode_omop_codes as concept
      on concept.vocabulary_code = phenotype.code
     and concept.coding_system_id = any(lookup.out_coding);

  --[!] Build standardised & upgraded mapping
  with
    -- Initial matches that have a valid `Maps to` relationship onto a valid SNOMED code
    standardised as (
      select
          mappable.phenotype_id,
          mappable.phenotype_version_id,
          mappable.concept_id,
          mappable.concept_version_id,
          concept.code as omop_id,
          concept.description as omop_name,
          concept.vocabulary_name as omop_vocabulary,
          concept.vocabulary_code as omop_code,
          concept.invalid_reason,
          mappable.origin_code,
          mappable.origin_system_id,
          mappable.origin_system_name,
          mappable.omop_id as replacement_id
        from omop_init_match as mappable
        join public.clinicalcode_omoprelationships as relationships
          on relationships.code0_id = mappable.omop_id
         and relationships.relationship = 'Maps to'
         and (relationships.invalid_reason is null or relationships.invalid_reason = '')
        join public.clinicalcode_omop_codes as concept
          on concept.code = relationships.code1_id
         and concept.vocabulary_name = 'SNOMED'
         and (concept.invalid_reason is null or concept.invalid_reason = '')
    ),
    -- Standardised rows plus the initial matches that could not be standardised
    upgradable as (
      select
          standardised.phenotype_id,
          standardised.phenotype_version_id,
          standardised.concept_id,
          standardised.concept_version_id,
          standardised.omop_id,
          standardised.omop_name,
          standardised.omop_vocabulary,
          standardised.omop_code,
          standardised.invalid_reason,
          standardised.origin_code,
          standardised.origin_system_id,
          standardised.origin_system_name
        from standardised
       union all
      select mappable.*
        from omop_init_match as mappable
        left join standardised
          on mappable.omop_id = standardised.replacement_id
       where standardised.omop_id is null
    ),
    -- Valid successors of rows whose OMOP code is flagged (U)pgraded or (D)eprecated
    replacements as (
      select
          upgradable.phenotype_id,
          upgradable.phenotype_version_id,
          upgradable.concept_id,
          upgradable.concept_version_id,
          concept.code as omop_id,
          concept.description as omop_name,
          concept.vocabulary_name as omop_vocabulary,
          concept.vocabulary_code as omop_code,
          concept.invalid_reason,
          upgradable.origin_code,
          upgradable.origin_system_id,
          upgradable.origin_system_name,
          upgradable.omop_id as replacement_id
        from upgradable
        join public.clinicalcode_omoprelationships as relationships
          on relationships.code0_id = upgradable.omop_id
         and (relationships.invalid_reason is null or relationships.invalid_reason = '')
         and regexp_like(relationships.relationship, '.*(Maps to|replaced by|was_a to|alt_to to).*')
        join public.clinicalcode_omop_codes as concept
          on concept.code = relationships.code1_id
         and (concept.invalid_reason is null or concept.invalid_reason = '')
       where upgradable.invalid_reason = 'U'
          or upgradable.invalid_reason = 'D'
    )
  insert
    into omop_best_match (
      phenotype_id, phenotype_version_id, concept_id, concept_version_id,
      omop_id, omop_name, omop_vocabulary, omop_code, invalid_reason,
      origin_code, origin_system_id, origin_system_name
    )
  select
      replacements.phenotype_id,
      replacements.phenotype_version_id,
      replacements.concept_id,
      replacements.concept_version_id,
      replacements.omop_id,
      replacements.omop_name,
      replacements.omop_vocabulary,
      replacements.omop_code,
      replacements.invalid_reason,
      replacements.origin_code,
      replacements.origin_system_id,
      replacements.origin_system_name
    from replacements
   union all
  select upgradable.*
    from upgradable
    left join replacements
      on upgradable.omop_id = replacements.replacement_id
   where replacements.omop_id is null;

  --[!] Build concept output
  with
    grouped as (
      select
          best.phenotype_id,
          best.phenotype_version_id,
          best.concept_id,
          best.concept_version_id,
          best.omop_id,
          array_agg(distinct best.origin_code) as origin_codes,
          array_agg(distinct best.origin_system_id) as origin_system_ids,
          array_agg(distinct best.origin_system_name) as origin_system_names
        from omop_best_match as best
       group by best.phenotype_id,
                best.phenotype_version_id,
                best.concept_id,
                best.concept_version_id,
                best.omop_id
    )
  insert
    into omop_concept_result (
      phenotype_id, phenotype_version_id, concept_id, concept_version_id,
      omop_id, omop_name, omop_vocabulary, omop_code, invalid_reason,
      origin_codes, origin_system_ids, origin_system_names
    )
  select
      grouped.phenotype_id,
      grouped.phenotype_version_id,
      grouped.concept_id,
      grouped.concept_version_id,
      concept.code as omop_id,
      concept.description as omop_name,
      concept.vocabulary_name as omop_vocabulary,
      concept.vocabulary_code as omop_code,
      concept.invalid_reason,
      grouped.origin_codes,
      grouped.origin_system_ids,
      grouped.origin_system_names
    from grouped
    join public.clinicalcode_omop_codes as concept
      on grouped.omop_id = concept.code;

  --[!] Build final output
  with
    grouped as (
      select
          best.phenotype_id,
          best.phenotype_version_id,
          best.omop_id,
          array_agg(distinct best.origin_code) as origin_codes,
          array_agg(distinct best.origin_system_id) as origin_system_ids,
          array_agg(distinct best.origin_system_name) as origin_system_names
        from omop_best_match as best
       group by best.phenotype_id,
                best.phenotype_version_id,
                best.omop_id
    )
  insert
    into omop_mapping_result (
      phenotype_id, phenotype_version_id,
      omop_id, omop_name, omop_vocabulary, omop_code, invalid_reason,
      origin_codes, origin_system_ids, origin_system_names
    )
  select
      grouped.phenotype_id,
      grouped.phenotype_version_id,
      concept.code as omop_id,
      concept.description as omop_name,
      concept.vocabulary_name as omop_vocabulary,
      concept.vocabulary_code as omop_code,
      concept.invalid_reason,
      grouped.origin_codes,
      grouped.origin_system_ids,
      grouped.origin_system_names
    from grouped
    join public.clinicalcode_omop_codes as concept
      on grouped.omop_id = concept.code;

  --[!] Compute concept map stats
  with
    input_cnt as (
      select
          phenotype_id,
          phenotype_version_id,
          concept_id,
          concept_version_id,
          coding_system_id,
          coding_system_name,
          count(*) as input_size
        from public.entity_codelists
       group by phenotype_id, phenotype_version_id, concept_id, concept_version_id, coding_system_id, coding_system_name
    ),
    output_cnt as (
      select
          phenotype_id,
          phenotype_version_id,
          concept_id,
          concept_version_id,
          count(*) as output_size
        from public.omop_concept_result
       group by phenotype_id, phenotype_version_id, concept_id, concept_version_id
    ),
    mapped_cnt as (
      select
          entity.phenotype_id,
          entity.phenotype_version_id,
          entity.concept_id,
          entity.concept_version_id,
          count(distinct code) as mapped_count
        from public.entity_codelists as entity
        join omop_best_match as mapped
          on entity.phenotype_id = mapped.phenotype_id
         and entity.phenotype_version_id = mapped.phenotype_version_id
         and entity.concept_id = mapped.concept_id
         and entity.concept_version_id = mapped.concept_version_id
         and entity.code = mapped.origin_code
       group by entity.phenotype_id, entity.phenotype_version_id, entity.concept_id, entity.concept_version_id
    ),
    unmapped as (
      select
          entity.phenotype_id,
          entity.phenotype_version_id,
          entity.concept_id,
          entity.concept_version_id,
          array_agg(distinct entity.code) as unmatched_codes
        from public.entity_codelists as entity
        left join omop_best_match as mapped
          on entity.phenotype_id = mapped.phenotype_id
         and entity.phenotype_version_id = mapped.phenotype_version_id
         and entity.concept_id = mapped.concept_id
         and entity.concept_version_id = mapped.concept_version_id
         and entity.code = mapped.origin_code
       where mapped.phenotype_id is null
       group by entity.phenotype_id, entity.phenotype_version_id, entity.concept_id, entity.concept_version_id
    ),
    info as (
      select
          input_cnt.phenotype_id,
          input_cnt.phenotype_version_id,
          input_cnt.concept_id,
          input_cnt.concept_version_id,
          input_cnt.coding_system_id,
          input_cnt.coding_system_name,
          input_cnt.input_size,
          coalesce(output_cnt.output_size, 0) as output_size,
          coalesce(mapped_cnt.mapped_count, 0) as mapped_count,
          coalesce(unmapped.unmatched_codes, '{}'::text[]) as unmatched_codes
        from input_cnt
        left join output_cnt
          using (phenotype_id, phenotype_version_id, concept_id, concept_version_id)
        left join mapped_cnt
          using (phenotype_id, phenotype_version_id, concept_id, concept_version_id)
        left join unmapped
          using (phenotype_id, phenotype_version_id, concept_id, concept_version_id)
    )
  insert
    into omop_concept_info(
      phenotype_id, phenotype_version_id, concept_id, concept_version_id,
      coding_system_id, coding_system_name,
      input_size, output_size, mapped_count, mapped_rate, unmatched_codes
    )
  select
      info.phenotype_id,
      info.phenotype_version_id,
      info.concept_id,
      info.concept_version_id,
      info.coding_system_id,
      info.coding_system_name,
      info.input_size,
      info.output_size,
      info.mapped_count,
      -- input_size comes from count(*) over a non-empty group, so it is >= 1
      100.0*(info.mapped_count::float / info.input_size::float) as mapped_rate,
      info.unmatched_codes
    from info;

  --[!] Compute phenotype map stats
  with
    counts as (
      select
          phenotype_id,
          phenotype_version_id,
          array_agg(distinct coding_system_id) as coding_system_ids,
          array_agg(distinct coding_system_name) as coding_system_names,
          count(distinct coding_system_id) as coding_system_count,
          -- The id list mirrors the `in_coding` values of `coding_lookup`
          array_agg(
            case
              when coding_system_id in (4,5,7,9,12,15,17,23,24,25,26) then coding_system_id
              else null
            end
          ) as mapped_systems,
          count(*) as total_size,
          sum(
            case
              when coding_system_id in (4,5,7,9,12,15,17,23,24,25,26) then 1
              else 0
            end
          ) as input_size
        from public.entity_codelists as entity
       group by phenotype_id, phenotype_version_id
    ),
    inputs as (
      select
          counts.phenotype_id,
          counts.phenotype_version_id,
          counts.coding_system_ids,
          counts.coding_system_names,
          counts.coding_system_count,
          array_remove(array(select distinct e from unnest(counts.mapped_systems) as a(e)), null) as mapped_systems,
          counts.total_size,
          counts.input_size
        from counts
    ),
    output_cnt as (
      select
          phenotype_id,
          phenotype_version_id,
          count(*) as output_size
        from public.omop_mapping_result
       group by phenotype_id, phenotype_version_id
    ),
    mapped_cnt as (
      select
          entity.phenotype_id,
          entity.phenotype_version_id,
          count(distinct code) as mapped_count
        from public.entity_codelists as entity
        join omop_best_match as mapped
          on entity.phenotype_id = mapped.phenotype_id
         and entity.phenotype_version_id = mapped.phenotype_version_id
         and entity.code = mapped.origin_code
       group by entity.phenotype_id, entity.phenotype_version_id
    ),
    info as (
      select
          inputs.phenotype_id,
          inputs.phenotype_version_id,
          inputs.coding_system_ids,
          inputs.coding_system_names,
          coalesce(inputs.coding_system_count, 0) as coding_system_count,
          coalesce(array_length(inputs.mapped_systems, 1), 0) as mapped_system_count,
          case
            when coalesce(inputs.coding_system_count, 0) > 0 then 100.0*(coalesce(array_length(inputs.mapped_systems, 1), 0)::float / inputs.coding_system_count::float)
            else 0
          end as mapped_system_rate,
          coalesce(inputs.total_size, 0) as total_size,
          coalesce(inputs.input_size, 0) as input_size,
          coalesce(output_cnt.output_size, 0) as output_size,
          coalesce(mapped_cnt.mapped_count, 0) as mapped_count,
          case
            when coalesce(inputs.input_size, 0) > 0 then 100.0*(coalesce(mapped_cnt.mapped_count, 0)::float / coalesce(inputs.input_size, 0)::float)
            else 0
          end as mapped_rate,
          case
            when coalesce(inputs.total_size, 0) > 0 then 100.0*(coalesce(mapped_cnt.mapped_count, 0)::float / coalesce(inputs.total_size, 0)::float)
            else 0
          end as total_rate
        from inputs
        left join output_cnt
          using (phenotype_id, phenotype_version_id)
        left join mapped_cnt
          using (phenotype_id, phenotype_version_id)
    )
  insert
    into omop_phenotype_info(
      phenotype_id, phenotype_version_id,
      coding_system_ids, coding_system_names, coding_system_count,
      mapped_system_count, mapped_system_rate,
      total_size, input_size, output_size,
      mapped_count, mapped_rate, total_rate
    )
  select
      phenotype_id,
      phenotype_version_id,
      coding_system_ids,
      coding_system_names,
      coding_system_count,
      mapped_system_count,
      mapped_system_rate,
      total_size,
      input_size,
      output_size,
      mapped_count,
      mapped_rate,
      total_rate
    from info;

  --[!] Compute & resolve total resultset
  open _cursor for
    select
      (
        select count(*)
          from omop_phenotype_info as info
      ) as phenotype_count,
      (
        select count(*)
          from omop_phenotype_info as info
         where info.mapped_system_rate > 0 and info.total_size > 0
      ) as mapped_count,
      (
        -- nullif: yield NULL instead of a division-by-zero error when no phenotype exists
        select 100.0*(t.mappable::float / nullif(t.total, 0)::float)
          from (
            select
                sum(
                  case
                    when info.mapped_system_rate > 0 and info.total_size > 0 then 1
                    else 0
                  end
                ) as mappable,
                count(*) as total
              from omop_phenotype_info as info
          ) t
      ) as map_rate,
      (
        select avg(info.mapped_rate)
          from omop_phenotype_info as info
      ) as average_incl_map_rate,
      (
        select avg(info.total_rate)
          from omop_phenotype_info as info
      ) as average_excl_map_rate;
end;
$tx$ language plpgsql;

-- NOTE(review): `_cursor` is not declared WITH HOLD, so it is closed when the
-- transaction of the DO block ends; presumably this script is meant to run inside
-- a single transaction (e.g. `psql --single-transaction`) -- confirm
fetch all from _cursor;


--[!] Misc.
/*
select *
  into _rec
  from omop_mapping_result
 limit 1;

raise notice '%', to_json(_rec);

-- for _rec in (
--   select coding_system_id as coding_id, coding_system_name as coding_name
--     from public.entity_codelists
--    group by coding_system_id, coding_system_name
-- )
-- loop
--   raise notice 'CodingSystem', _rec.coding_id, _rec.coding_name;
-- end loop;
*/
diff --git a/docs/omop/transform.ps1 b/docs/omop/transform.ps1
new file mode 100644
index 000000000..44800260e
--- /dev/null
+++ b/docs/omop/transform.ps1
@@ -0,0 +1,17 @@
<# Export reformatted OMOP files (tab-delimited -> comma-delimited, one output per input) #>
Get-ChildItem "./path/to/files" -Filter *.csv `
| Foreach-Object {
  # Keep the source file name; a fixed output path would be overwritten by every input file
  Import-Csv -Path $_.FullName -Delimiter "`t" | Export-Csv -Path (Join-Path '.project/out' $_.Name) -Encoding UTF8 -NoTypeInformation
}

<# Export omop.relationships.csv #>
Import-CSV `
  -Path 'CONCEPT_RELATIONSHIP.csv' `
  -Header "code0_id","code1_id","relationship","valid_start_date","valid_end_date","invalid_reason" `
  | select -skip 1 `
  | Foreach-Object {
    # Rewrite the dates from YYYYMMDD to YYYY-MM-DD using named capture groups
    $_.'valid_start_date' = $($_.'valid_start_date' -replace '(?<year>\d{4})(?<month>\d{2})(?<day>\d{2})', '${year}-${month}-${day}')
    $_.'valid_end_date' = $($_.'valid_end_date' -replace '(?<year>\d{4})(?<month>\d{2})(?<day>\d{2})', '${year}-${month}-${day}')
    $_
  } `
  | Export-CSV -Path 'omop.relationships.csv' -Encoding UTF8 -NoTypeInformation