Skip to content

Commit b6a9cbb

Browse files
authored
Add two new cells (#85)
1 parent 7b0abff commit b6a9cbb

File tree

1 file changed

+99
-10
lines changed

1 file changed

+99
-10
lines changed

notebooks/analysis/mapping_analysis.ipynb

Lines changed: 99 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@
158158
"\n",
159159
"var_count = 0\n",
160160
"diff_vars_dict = {}\n",
161+
"concordant_vars_dict = {}\n",
161162
"\n",
162163
"for urn in score_sets:\n",
163164
" files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n",
@@ -172,6 +173,7 @@
172173
" dat = dat[\"mapped_scores\"]\n",
173174
"\n",
174175
" diff_vars = []\n",
176+
" concordant_vars = []\n",
175177
" strand = strand_dict[urn]\n",
176178
"\n",
177179
" for j,var_mapping in enumerate(dat):\n",
@@ -186,6 +188,8 @@
186188
"\n",
187189
" if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n",
188190
" diff_vars.append(j)\n",
191+
" else:\n",
192+
" concordant_vars.append(j)\n",
189193
"\n",
190194
" else:\n",
191195
" for pre_mapped_var in var_mapping[\"pre_mapped\"][\"members\"]:\n",
@@ -198,7 +202,10 @@
198202
"\n",
199203
" if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n",
200204
" diff_vars.append(j)\n",
205+
" else:\n",
206+
" concordant_vars.append(j)\n",
201207
" diff_vars_dict[urn] = diff_vars\n",
208+
" concordant_vars_dict[urn] = concordant_vars\n",
202209
"\n",
203210
"f\"The number of examined variant pairs is: {var_count}\""
204211
]
@@ -361,6 +368,39 @@
361368
"f\"There are {mm_count} instances of reference mismatch in the subset\""
362369
]
363370
},
371+
{
372+
"cell_type": "markdown",
373+
"id": "de7b8e56",
374+
"metadata": {},
375+
"source": [
376+
"Run the cell below to count the score sets that contain no discordant variants, and the total number of concordant variants across those score sets."
377+
]
378+
},
379+
{
380+
"cell_type": "code",
381+
"execution_count": 11,
382+
"id": "08d77cd1",
383+
"metadata": {},
384+
"outputs": [
385+
{
386+
"data": {
387+
"text/plain": [
388+
"'There are 3031995 concordant variants in this subset of 736 score sets'"
389+
]
390+
},
391+
"execution_count": 11,
392+
"metadata": {},
393+
"output_type": "execute_result"
394+
}
395+
],
396+
"source": [
397+
"concordant_ss = [urn for urn, discordant in diff_vars_dict.items() if len(discordant) == 0]  # score sets with no discordant variants\n",
398+
"concordant_ss_variant_count = 0\n",
399+
"for key in concordant_ss:  # NOTE(review): indices appear to be appended once per member comparison upstream, so this may exceed the unique-variant count -- confirm\n",
400+
"    concordant_ss_variant_count += len(concordant_vars_dict[key])\n",
401+
"f\"There are {concordant_ss_variant_count} concordant variants in this subset of {len(concordant_ss)} score sets\""
402+
]
403+
},
364404
{
365405
"cell_type": "markdown",
366406
"id": "92bc5e87",
@@ -372,7 +412,7 @@
372412
},
373413
{
374414
"cell_type": "code",
375-
"execution_count": 11,
415+
"execution_count": 12,
376416
"id": "3eb301ed",
377417
"metadata": {},
378418
"outputs": [],
@@ -444,7 +484,7 @@
444484
},
445485
{
446486
"cell_type": "code",
447-
"execution_count": 12,
487+
"execution_count": 13,
448488
"id": "fc87cbbe",
449489
"metadata": {},
450490
"outputs": [
@@ -480,7 +520,7 @@
480520
},
481521
{
482522
"cell_type": "code",
483-
"execution_count": 13,
523+
"execution_count": 14,
484524
"id": "5d0969e2",
485525
"metadata": {},
486526
"outputs": [
@@ -490,7 +530,7 @@
490530
"'There are 2994178 MAVE variants that were processed in this analysis'"
491531
]
492532
},
493-
"execution_count": 13,
533+
"execution_count": 14,
494534
"metadata": {},
495535
"output_type": "execute_result"
496536
}
@@ -523,7 +563,7 @@
523563
},
524564
{
525565
"cell_type": "code",
526-
"execution_count": 14,
566+
"execution_count": 15,
527567
"id": "a4135f64",
528568
"metadata": {},
529569
"outputs": [
@@ -577,7 +617,7 @@
577617
},
578618
{
579619
"cell_type": "code",
580-
"execution_count": 15,
620+
"execution_count": 16,
581621
"id": "335af4a1",
582622
"metadata": {},
583623
"outputs": [
@@ -664,7 +704,7 @@
664704
"Interquartile Range for Variants in a Score Set... (920, 1794)"
665705
]
666706
},
667-
"execution_count": 15,
707+
"execution_count": 16,
668708
"metadata": {},
669709
"output_type": "execute_result"
670710
}
@@ -697,7 +737,7 @@
697737
},
698738
{
699739
"cell_type": "code",
700-
"execution_count": 16,
740+
"execution_count": 17,
701741
"id": "7ec89e27",
702742
"metadata": {},
703743
"outputs": [
@@ -742,7 +782,7 @@
742782
},
743783
{
744784
"cell_type": "code",
745-
"execution_count": 17,
785+
"execution_count": 18,
746786
"id": "971dbd8a",
747787
"metadata": {},
748788
"outputs": [],
@@ -759,7 +799,7 @@
759799
},
760800
{
761801
"cell_type": "code",
762-
"execution_count": 18,
802+
"execution_count": 19,
763803
"id": "29a0b28d",
764804
"metadata": {},
765805
"outputs": [
@@ -815,6 +855,55 @@
815855
"plt.savefig(\"mapped_variants_count.png\", dpi=300)\n",
816856
"plt.show()"
817857
]
858+
},
859+
{
860+
"cell_type": "markdown",
861+
"id": "feb4cfc1",
862+
"metadata": {},
863+
"source": [
864+
"Compute the number of score sets where VRS IDs are expected to be equal (i.e., the MAVE target sequence is the human reference sequence), and the number where they are not."
865+
]
866+
},
867+
{
868+
"cell_type": "code",
869+
"execution_count": 20,
870+
"id": "67e8d4ab",
871+
"metadata": {},
872+
"outputs": [
873+
{
874+
"data": {
875+
"text/plain": [
876+
"'The number of score sets with equivalent target sequences and human reference sequences is 158 and the number with unequal sequences is 899'"
877+
]
878+
},
879+
"execution_count": 20,
880+
"metadata": {},
881+
"output_type": "execute_result"
882+
}
883+
],
884+
"source": [
885+
"mave_ref_equal_count = 0  # score sets whose target sequence ID matches the mapped reference sequence ID\n",
886+
"mave_ref_unequal_count = 0  # score sets whose two sequence IDs differ\n",
887+
"for urn in score_sets:\n",
888+
"    if urn.startswith(\"urn:mavedb:00000097\"):  # Edge cases where variants were mapped at protein level\n",
889+
"        mave_ref_equal_count += 1\n",
890+
"    else:\n",
891+
"        files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n",
892+
"        if files:\n",
893+
"            latest_file = max(files, key=os.path.getmtime)  # most recently modified mapping file wins\n",
894+
"        else:\n",
895+
"            continue  # no mapping file for this score set: counted in neither tally\n",
896+
"        # Parse the newest mapping file; the context manager closes the handle on every iteration\n",
897+
"        with latest_file.open() as f:\n",
898+
"            dat = json.load(f)\n",
899+
"        mave_seq = dat[\"computed_reference_sequence\"][\"sequence_id\"]\n",
900+
"        ref_seq = dat[\"mapped_reference_sequence\"][\"sequence_id\"]\n",
901+
"        if mave_seq == ref_seq:\n",
902+
"            mave_ref_equal_count += 1\n",
903+
"        else:\n",
904+
"            mave_ref_unequal_count += 1\n",
905+
"f\"The number of score sets with equivalent target sequences and human reference sequences is {mave_ref_equal_count} and the number with unequal sequences is {mave_ref_unequal_count}\""
906+
]
818907
}
819908
],
820909
"metadata": {

0 commit comments

Comments
 (0)