
Commit d869194

correct quality <-> error functions and adding some description in functions

committed (1 parent: 650e4cb)

2 files changed: +143, -22 lines


notebook/simGL.ipynb

Lines changed: 126 additions & 13 deletions
@@ -18,19 +18,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "a3c58dad-95fa-4fe1-8971-521842ea4182",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The rpy2.ipython extension is already loaded. To reload it, use:\n",
-      "  %reload_ext rpy2.ipython\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import time\n",
     "import numpy as np\n",
@@ -48,10 +39,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "966418dd-9400-405c-8983-a4714ad51704",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──\n",
+      "\n",
+      "R[write to console]: ✔ tibble 3.1.7 ✔ dplyr 1.0.9\n",
+      "✔ tidyr 1.2.0 ✔ stringr 1.4.0\n",
+      "✔ readr 2.1.2 ✔ forcats 0.5.1\n",
+      "✔ purrr 0.3.4 \n",
+      "\n",
+      "R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n",
+      "✖ dplyr::filter() masks stats::filter()\n",
+      "✖ dplyr::lag() masks stats::lag()\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "%%R\n",
     "\n",
@@ -6819,6 +6828,110 @@
    "id": "dca5ddb8-285c-4677-887e-a0fb3e8f17d6",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "-10 * log(0.000001) = 60"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7846e2d4-dc88-46fd-9f15-dff97b13f9ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "60.0"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "-10*np.log10(0.000001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be8f646b-ea5d-4dc5-b84e-7b46390979ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "-10*np.log10(0.000001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "1d8b345c-771b-4bd8-b3ce-e6f4bc9e180f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1e-06"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.power(10, -(60/10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "94eb0a1c-fa6c-4970-bf09-59a8cd017d8c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "60.0"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "-10*np.log10(0.000001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "e8b5e154-b1b3-4a7c-af06-e74b1f1648b8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.0024787521766663585"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.exp(-60/10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a17f675b-2961-4365-aa28-7a77f955e6ee",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],
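The cells added above check the Phred-scale identities behind this commit. A quick worked restatement of those cells (a sketch; it uses only the values already shown in the diff):

```python
import numpy as np

e = 0.000001                    # the error probability used in the notebook cells
q = -10 * np.log10(e)           # Phred quality: -10*log10(1e-6) = 60.0
back = np.power(10, -(q / 10))  # base-10 inverse: 10**(-60/10) = 1e-06
wrong = np.exp(-q / 10)         # natural-log inverse: ~0.0024788, not 1e-06
print(q, back, wrong)
```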

simGL/simGL.py

Lines changed: 17 additions & 9 deletions
@@ -4,10 +4,10 @@
 from scipy.stats import binom

 def e2q(e):
-    return -10*np.log(e)
+    return -10*np.log10(e)

 def q2e(q):
-    return np.exp(-q/10)
+    return np.power(10, -(q/10))

 def incorporate_monomorphic(gm, pos, start, end):
     '''
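A minimal round-trip check of the corrected conversion functions above (a sketch; the function bodies are copied from this hunk and only numpy is assumed):

```python
import numpy as np

def e2q(e):
    # Phred quality from error probability: Q = -10*log10(e)
    return -10*np.log10(e)

def q2e(q):
    # error probability from Phred quality: e = 10**(-Q/10)
    return np.power(10, -(q/10))

assert np.isclose(e2q(1e-6), 60.0)       # a 1-in-a-million error corresponds to Q60
assert np.isclose(q2e(e2q(1e-6)), 1e-6)  # the two functions are now true inverses
```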
@@ -46,7 +46,7 @@ def refalt(ref, alt, n_sit):
     if ref is None and alt is None:
         ref = np.full(n_sit, "A")
         alt = np.full(n_sit, "C")
-    return ref, alt
+    return ref, alt

 def depth_per_haplotype(rng, mean_depth, std_depth, n_hap):
     if isinstance(mean_depth, np.ndarray):
@@ -66,7 +66,7 @@ def refalt_int_encoding(gm, ref, alt):
     refalt_int[refalt_str == "T"] = 3
     return refalt_int[gm.reshape(-1), np.repeat(np.arange(gm.shape[0]), gm.shape[1])].reshape(gm.shape)

-def linked_depth(rng, DPh, read_length, sites_n):
+def linked_depth(rng, DPh, read_length, n_sit):
     '''
     Simulates reads in a contiguous genomic region to compute the depth per position.

@@ -78,7 +78,7 @@ def linked_depth(rng, DPh, read_length, sites_n):
         Numpy array with the depth per haplotype
     read_length : `int`
         Read length in base pair units
-    sites_n : `int`
+    n_sit : `int`
         number of sites that depth has to be simulated for


@@ -87,10 +87,10 @@ def linked_depth(rng, DPh, read_length, sites_n):
         Depth per site per haplotype
     '''
     DP = []
-    read_n = ((DPh*sites_n)/read_length).astype("int")
+    read_n = ((DPh*n_sit)/read_length).astype("int")
     for r in read_n:
-        dp = np.zeros((sites_n,), dtype=int)
-        for p in rng.integers(low=0, high=sites_n-read_length+1, size=r):
+        dp = np.zeros((n_sit,), dtype=int)
+        for p in rng.integers(low=0, high=n_sit-read_length+1, size=r):
             dp[p:p+read_length] += 1
         DP.append(dp.tolist())
     return np.array(DP).T
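For context, a hypothetical call to the renamed function (illustrative values; the import path is an assumption and should be adjusted to the package's actual layout):

```python
import numpy as np
from simGL.simGL import linked_depth  # assumed import path

rng = np.random.default_rng(1234)  # linked_depth draws read start positions from this Generator
DPh = np.array([20, 30])           # illustrative mean depth for two haplotypes
DP = linked_depth(rng, DPh, read_length=100, n_sit=10_000)
print(DP.shape)                    # (10000, 2): depth per site per haplotype
```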
@@ -150,7 +150,7 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non
         (haplotypic samples, )) and the order must be the same as the second dimention of `gm`.

     ploidy : `int`
-        Number of haplotypic chromosomes per individual.
+        Number of haplotypic chromosomes per individual. It is recommended to read the Notes about ploidy.

     ref : `numpy.ndarray`, optional
         Reference alleles list per site. The size of the array must be (sites, ) and the order has to
@@ -181,6 +181,14 @@
       must be 15.
     - If monomorphic sites are included, the `alt` values corresponding to those sites are not taken into account,
       but they must be still indicated.
+    - Regarding ploidy, if the error parameter is specified as a constant for all individuals, the user can specify
+      the desired ploidy of the simulated organisms directly.
+      If a different error rate per haplotype is given and the user wants to compute Genotype Likelihoods (GL) for
+      organisms with ploidy > 1, ploidy should be set to 1 for this function; the desired ploidy can then be
+      specified when the later function `allelereadcounts_to_GL()` is called. This is because the error values
+      must be provided again to compute GL and, if ploidy > 1 is specified here, the dimensions of `arc`
+      will be smaller than the dimensions of `e`. Nonetheless, if the user wants to obtain the output `arc` at
+      a certain ploidy, the `ploidy_sum(arc, ploidy)` function can be used.
     '''
     #Checks
     assert check_gm(gm)
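The new Notes paragraph describes a two-step workflow when error rates differ per haplotype. A rough sketch of that workflow follows; the genotype matrix and import path are placeholders, and the keyword names passed to `allelereadcounts_to_GL()` are assumptions, since its signature is not part of this diff:

```python
import numpy as np
from simGL.simGL import sim_allelereadcounts, allelereadcounts_to_GL, ploidy_sum  # assumed import path

rng = np.random.default_rng(1234)
gm = rng.integers(0, 2, size=(1000, 4))  # placeholder genotype matrix: (sites, haplotypic samples)
e_hap = np.full(gm.shape[1], 0.001)      # one error rate per haplotypic sample

# 1) Simulate allele read counts with ploidy = 1 so that `arc` and `e` keep matching dimensions.
arc = sim_allelereadcounts(gm, mean_depth=30, e=e_hap, ploidy=1, seed=1234)

# 2) Compute genotype likelihoods at the desired ploidy; the error values are supplied again here
#    (keyword names are guesses, not the documented signature).
gl = allelereadcounts_to_GL(arc, e=e_hap, ploidy=2)

# Optionally, collapse the read counts themselves to the desired ploidy.
arc_diploid = ploidy_sum(arc, 2)
```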
