|
158 | 158 | "\n", |
159 | 159 | "var_count = 0\n", |
160 | 160 | "diff_vars_dict = {}\n", |
| 161 | + "concordant_vars_dict = {}\n", |
161 | 162 | "\n", |
162 | 163 | "for urn in score_sets:\n", |
163 | 164 | " files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n", |
|
172 | 173 | " dat = dat[\"mapped_scores\"]\n", |
173 | 174 | "\n", |
174 | 175 | " diff_vars = []\n", |
| 176 | + " concordant_vars = []\n", |
175 | 177 | " strand = strand_dict[urn]\n", |
176 | 178 | "\n", |
177 | 179 | " for j,var_mapping in enumerate(dat):\n", |
|
186 | 188 | "\n", |
187 | 189 | " if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n", |
188 | 190 | " diff_vars.append(j)\n", |
| 191 | + " else:\n", |
| 192 | + " concordant_vars.append(j)\n", |
189 | 193 | "\n", |
190 | 194 | " else:\n", |
191 | 195 | " for pre_mapped_var in var_mapping[\"pre_mapped\"][\"members\"]:\n", |
|
198 | 202 | "\n", |
199 | 203 | " if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n", |
200 | 204 | " diff_vars.append(j)\n", |
| 205 | + " else:\n", |
| 206 | + " concordant_vars.append(j)\n", |
201 | 207 | " diff_vars_dict[urn] = diff_vars\n", |
| 208 | + " concordant_vars_dict[urn] = concordant_vars\n", |
202 | 209 | "\n", |
203 | 210 | "f\"The number of examined variant pairs is: {var_count}\"" |
204 | 211 | ] |
|
361 | 368 | "f\"There are {mm_count} instances of reference mismatch in the subset\"" |
362 | 369 | ] |
363 | 370 | }, |
| 371 | + { |
| 372 | + "cell_type": "markdown", |
| 373 | + "id": "de7b8e56", |
| 374 | + "metadata": {}, |
| 375 | + "source": [ |
| 376 | + "Run the cell below to count the score sets that have no discordant variants and the total number of concordant variants in those score sets." |
| 377 | + ] |
| 378 | + }, |
| 379 | + { |
| 380 | + "cell_type": "code", |
| 381 | + "execution_count": 11, |
| 382 | + "id": "08d77cd1", |
| 383 | + "metadata": {}, |
| 384 | + "outputs": [ |
| 385 | + { |
| 386 | + "data": { |
| 387 | + "text/plain": [ |
| 388 | + "'There are 3031995 concordant variants in this subset of 736 score sets'" |
| 389 | + ] |
| 390 | + }, |
| 391 | + "execution_count": 11, |
| 392 | + "metadata": {}, |
| 393 | + "output_type": "execute_result" |
| 394 | + } |
| 395 | + ], |
| 396 | + "source": [ |
| 397 | + "concordant_ss = [key for key, diff_vars_list in diff_vars_dict.items() if not diff_vars_list]\n", |
| 398 | + "concordant_ss_variant_count = 0\n", |
| 399 | + "for key in concordant_ss:\n", |
| 400 | + " concordant_ss_variant_count = concordant_ss_variant_count + len(concordant_vars_dict[key])\n", |
| 401 | + "f\"There are {concordant_ss_variant_count} concordant variants in this subset of {len(concordant_ss)} score sets\"" |
| 402 | + ] |
| 403 | + }, |
364 | 404 | { |
365 | 405 | "cell_type": "markdown", |
366 | 406 | "id": "92bc5e87", |
|
372 | 412 | }, |
373 | 413 | { |
374 | 414 | "cell_type": "code", |
375 | | - "execution_count": 11, |
| 415 | + "execution_count": 12, |
376 | 416 | "id": "3eb301ed", |
377 | 417 | "metadata": {}, |
378 | 418 | "outputs": [], |
|
444 | 484 | }, |
445 | 485 | { |
446 | 486 | "cell_type": "code", |
447 | | - "execution_count": 12, |
| 487 | + "execution_count": 13, |
448 | 488 | "id": "fc87cbbe", |
449 | 489 | "metadata": {}, |
450 | 490 | "outputs": [ |
|
480 | 520 | }, |
481 | 521 | { |
482 | 522 | "cell_type": "code", |
483 | | - "execution_count": 13, |
| 523 | + "execution_count": 14, |
484 | 524 | "id": "5d0969e2", |
485 | 525 | "metadata": {}, |
486 | 526 | "outputs": [ |
|
490 | 530 | "'There are 2994178 MAVE variants that were processed in this analysis'" |
491 | 531 | ] |
492 | 532 | }, |
493 | | - "execution_count": 13, |
| 533 | + "execution_count": 14, |
494 | 534 | "metadata": {}, |
495 | 535 | "output_type": "execute_result" |
496 | 536 | } |
|
523 | 563 | }, |
524 | 564 | { |
525 | 565 | "cell_type": "code", |
526 | | - "execution_count": 14, |
| 566 | + "execution_count": 15, |
527 | 567 | "id": "a4135f64", |
528 | 568 | "metadata": {}, |
529 | 569 | "outputs": [ |
|
577 | 617 | }, |
578 | 618 | { |
579 | 619 | "cell_type": "code", |
580 | | - "execution_count": 15, |
| 620 | + "execution_count": 16, |
581 | 621 | "id": "335af4a1", |
582 | 622 | "metadata": {}, |
583 | 623 | "outputs": [ |
|
664 | 704 | "Interquartile Range for Variants in a Score Set... (920, 1794)" |
665 | 705 | ] |
666 | 706 | }, |
667 | | - "execution_count": 15, |
| 707 | + "execution_count": 16, |
668 | 708 | "metadata": {}, |
669 | 709 | "output_type": "execute_result" |
670 | 710 | } |
|
697 | 737 | }, |
698 | 738 | { |
699 | 739 | "cell_type": "code", |
700 | | - "execution_count": 16, |
| 740 | + "execution_count": 17, |
701 | 741 | "id": "7ec89e27", |
702 | 742 | "metadata": {}, |
703 | 743 | "outputs": [ |
|
742 | 782 | }, |
743 | 783 | { |
744 | 784 | "cell_type": "code", |
745 | | - "execution_count": 17, |
| 785 | + "execution_count": 18, |
746 | 786 | "id": "971dbd8a", |
747 | 787 | "metadata": {}, |
748 | 788 | "outputs": [], |
|
759 | 799 | }, |
760 | 800 | { |
761 | 801 | "cell_type": "code", |
762 | | - "execution_count": 18, |
| 802 | + "execution_count": 19, |
763 | 803 | "id": "29a0b28d", |
764 | 804 | "metadata": {}, |
765 | 805 | "outputs": [ |
|
815 | 855 | "plt.savefig(\"mapped_variants_count.png\", dpi=300)\n", |
816 | 856 | "plt.show()" |
817 | 857 | ] |
| 858 | + }, |
| 859 | + { |
| 860 | + "cell_type": "markdown", |
| 861 | + "id": "feb4cfc1", |
| 862 | + "metadata": {}, |
| 863 | + "source": [ |
| 864 | + "Compute the number of score sets where VRS IDs are expected to be equal (i.e., the MAVE target sequence is the human reference sequence) and the number where they are not." |
| 865 | + ] |
| 866 | + }, |
| 867 | + { |
| 868 | + "cell_type": "code", |
| 869 | + "execution_count": 20, |
| 870 | + "id": "67e8d4ab", |
| 871 | + "metadata": {}, |
| 872 | + "outputs": [ |
| 873 | + { |
| 874 | + "data": { |
| 875 | + "text/plain": [ |
| 876 | + "'The number of score sets with equivalent target sequences and human reference sequences is 158 and the number with unequal sequences is 899'" |
| 877 | + ] |
| 878 | + }, |
| 879 | + "execution_count": 20, |
| 880 | + "metadata": {}, |
| 881 | + "output_type": "execute_result" |
| 882 | + } |
| 883 | + ], |
| 884 | + "source": [ |
| 885 | + "mave_ref_equal_count = 0\n", |
| 886 | + "mave_ref_unequal_count = 0\n", |
| 887 | + "for urn in score_sets:\n", |
| 888 | + " if urn.startswith(\"urn:mavedb:00000097\"): # Edge cases where variants were mapped at protein level\n", |
| 889 | + " mave_ref_equal_count += 1\n", |
| 890 | + " else:\n", |
| 891 | + " files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n", |
| 892 | + " if files:\n", |
| 893 | + " latest_file = max(files, key=os.path.getmtime)\n", |
| 894 | + " else:\n", |
| 895 | + " continue\n", |
| 896 | + "\n", |
| 897 | + " f = Path.open(latest_file)\n", |
| 898 | + " dat = json.load(f)\n", |
| 899 | + " mave_seq = dat[\"computed_reference_sequence\"][\"sequence_id\"]\n", |
| 900 | + " ref_seq = dat[\"mapped_reference_sequence\"][\"sequence_id\"]\n", |
| 901 | + " if mave_seq == ref_seq:\n", |
| 902 | + " mave_ref_equal_count += 1\n", |
| 903 | + " else:\n", |
| 904 | + " mave_ref_unequal_count += 1\n", |
| 905 | + "f\"The number of score sets with equivalent target sequences and human reference sequences is {mave_ref_equal_count} and the number with unequal sequences is {mave_ref_unequal_count}\"" |
| 906 | + ] |
818 | 907 | } |
819 | 908 | ], |
820 | 909 | "metadata": { |
|
0 commit comments