Skip to content

Commit 87fd735

Browse files
MoiCollgrahamgower
authored andcommitted
create functions linked_depth and independent_depth
1 parent f08cd85 commit 87fd735

File tree

2 files changed

+295
-6
lines changed

2 files changed

+295
-6
lines changed

notebook/simGL.ipynb

Lines changed: 241 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
},
1919
{
2020
"cell_type": "code",
21-
"execution_count": 1,
21+
"execution_count": 5,
2222
"id": "a3c58dad-95fa-4fe1-8971-521842ea4182",
2323
"metadata": {},
2424
"outputs": [],
@@ -38,7 +38,7 @@
3838
},
3939
{
4040
"cell_type": "code",
41-
"execution_count": 2,
41+
"execution_count": 6,
4242
"id": "966418dd-9400-405c-8983-a4714ad51704",
4343
"metadata": {},
4444
"outputs": [
@@ -4316,8 +4316,247 @@
43164316
{
43174317
"cell_type": "code",
43184318
"execution_count": null,
4319+
"id": "326f8a0e-61f5-42fb-871f-aaeb60b9b33a",
4320+
"metadata": {},
4321+
"outputs": [],
4322+
"source": []
4323+
},
4324+
{
4325+
"cell_type": "code",
4326+
"execution_count": null,
4327+
"id": "53c5c073-a47e-4d08-99e3-91ee91bb5f64",
4328+
"metadata": {},
4329+
"outputs": [],
4330+
"source": []
4331+
},
4332+
{
4333+
"cell_type": "code",
4334+
"execution_count": null,
4335+
"id": "007e3796-3095-42b7-b90b-6c536722a719",
4336+
"metadata": {},
4337+
"outputs": [],
4338+
"source": []
4339+
},
4340+
{
4341+
"cell_type": "code",
4342+
"execution_count": 1,
4343+
"id": "f1f616b5-1187-40c5-8736-cd1c8f5eb554",
4344+
"metadata": {},
4345+
"outputs": [],
4346+
"source": [
4347+
"import numpy as np"
4348+
]
4349+
},
4350+
{
4351+
"cell_type": "code",
4352+
"execution_count": 2,
43194353
"id": "d298a22c-d9fe-44d4-897f-e763d35cb7d9",
43204354
"metadata": {},
4355+
"outputs": [
4356+
{
4357+
"data": {
4358+
"text/plain": [
4359+
"array([9148, 9149, 9150, ..., 1937, 1938, 1939])"
4360+
]
4361+
},
4362+
"execution_count": 2,
4363+
"metadata": {},
4364+
"output_type": "execute_result"
4365+
}
4366+
],
4367+
"source": [
4368+
"seq_len = 37498\n",
4369+
"n_reads = 9527-12\n",
4370+
"read_length = 151\n",
4371+
"\n",
4372+
"\n",
4373+
"df_sim = np.array([int(x) for x in np.random.uniform(low=0.0, high=seq_len, size=n_reads)])\n",
4374+
"pos = []\n",
4375+
"for s in df_sim:\n",
4376+
" for i in range(s, s+read_length):\n",
4377+
" pos.append(i)\n",
4378+
"pos = np.array(pos)\n",
4379+
"pos"
4380+
]
4381+
},
4382+
{
4383+
"cell_type": "code",
4384+
"execution_count": 30,
4385+
"id": "35643811-795f-4387-9f9c-d99bfc17e3a8",
4386+
"metadata": {},
4387+
"outputs": [
4388+
{
4389+
"name": "stdout",
4390+
"output_type": "stream",
4391+
"text": [
4392+
"[15.37810676 13.95450312 14.17387291 10.11706523]\n"
4393+
]
4394+
}
4395+
],
4396+
"source": [
4397+
"def depth_per_haplotype(rng, mean_depth, std_depth, n_hap):\n",
4398+
" if isinstance(mean_depth, np.ndarray):\n",
4399+
" return mean_depth\n",
4400+
" else:\n",
4401+
" dp = np.full((n_hap, ), 0.0)\n",
4402+
" while (dp <= 0).sum():\n",
4403+
" n = (dp <= 0).sum()\n",
4404+
" dp[dp <= 0] = rng.normal(loc = mean_depth, scale = std_depth, size=n)\n",
4405+
" return dp\n",
4406+
"\n",
4407+
"gm = np.array([[0, 0, 1, 0], [1, 1, 0, 1]])\n",
4408+
"mean_depth = 15\n",
4409+
"e = 0.05\n",
4410+
"ploidy = 2\n",
4411+
"seed = 2\n",
4412+
"std_depth = 2\n",
4413+
"\n",
4414+
"err = np.array([[1-e, e/3, e/3, e/3], [e/3, 1-e, e/3, e/3], [e/3, e/3, 1-e, e/3], [e/3, e/3, e/3, 1-e]])\n",
4415+
"rng = np.random.default_rng(seed)\n",
4416+
"#1. Depths (DP) per haplotype (h)\n",
4417+
"DPh = depth_per_haplotype(rng, mean_depth, std_depth, gm.shape[1])\n",
4418+
"print(DPh)"
4419+
]
4420+
},
4421+
{
4422+
"cell_type": "code",
4423+
"execution_count": 10,
4424+
"id": "566fd8ee-ed0c-49c4-b326-83c6cd7e9aa0",
4425+
"metadata": {},
4426+
"outputs": [
4427+
{
4428+
"data": {
4429+
"text/plain": [
4430+
"array([15.37810676, 13.95450312, 14.17387291, 10.11706523])"
4431+
]
4432+
},
4433+
"execution_count": 10,
4434+
"metadata": {},
4435+
"output_type": "execute_result"
4436+
}
4437+
],
4438+
"source": [
4439+
"DPh"
4440+
]
4441+
},
4442+
{
4443+
"cell_type": "code",
4444+
"execution_count": 57,
4445+
"id": "259f5a19-f129-4251-beb6-3a9eaac155c4",
4446+
"metadata": {},
4447+
"outputs": [
4448+
{
4449+
"data": {
4450+
"text/plain": [
4451+
"(4, 10)"
4452+
]
4453+
},
4454+
"execution_count": 57,
4455+
"metadata": {},
4456+
"output_type": "execute_result"
4457+
}
4458+
],
4459+
"source": [
4460+
"def linked_depth(rng, DPh, read_length, sites_n):\n",
4461+
" '''\n",
4462+
" Simulates reads in a contiguous genomic region to compute the depth per position.\n",
4463+
" \n",
4464+
" Parameters\n",
4465+
" ----------\n",
4466+
" rng : `numpy.random._generator.Generator` \n",
4467+
" random number generation numpy object\n",
4468+
" DPh : `numpy.ndarray`\n",
4469+
" Numpy array with the depth per haplotype\n",
4470+
" read_length : `int`\n",
4471+
" Read length in base pair units\n",
4472+
" sites_n : `int`\n",
4473+
" number of sites that depth has to be simulated for\n",
4474+
" \n",
4475+
" Returns \n",
4476+
" -------\n",
4477+
" DP : `numpy.ndarray`\n",
4478+
" Depth per site per haplotype\n",
4479+
" '''\n",
4480+
" DP = []\n",
4481+
" read_n = ((DPh*sites_n)/read_length).astype(\"int\")\n",
4482+
" for r in read_n:\n",
4483+
" dp = np.zeros((sites_n,), dtype=int)\n",
4484+
" for p in rng.integers(low=0, high=sites_n-read_length+1, size=r):\n",
4485+
" dp[p:p+read_length] += 1\n",
4486+
" DP.append(dp.tolist())\n",
4487+
" return np.array(DP)\n",
4488+
" \n",
4489+
"linked_depth(rng, DPh, read_length = 2, sites_n = 10)"
4490+
]
4491+
},
4492+
{
4493+
"cell_type": "code",
4494+
"execution_count": 26,
4495+
"id": "08b04ef0-d54f-45f8-bff1-640916061ea3",
4496+
"metadata": {},
4497+
"outputs": [
4498+
{
4499+
"data": {
4500+
"text/plain": [
4501+
"array([5, 4, 1, 1, 1, 3, 7, 9, 4, 3])"
4502+
]
4503+
},
4504+
"execution_count": 26,
4505+
"metadata": {},
4506+
"output_type": "execute_result"
4507+
}
4508+
],
4509+
"source": [
4510+
"np.random.randint(low = 0, high = 10, size = 10)"
4511+
]
4512+
},
4513+
{
4514+
"cell_type": "code",
4515+
"execution_count": 38,
4516+
"id": "a03db2fd-3480-47fd-aa80-6c41a0f41cc6",
4517+
"metadata": {},
4518+
"outputs": [
4519+
{
4520+
"data": {
4521+
"text/plain": [
4522+
"(10,)"
4523+
]
4524+
},
4525+
"execution_count": 38,
4526+
"metadata": {},
4527+
"output_type": "execute_result"
4528+
}
4529+
],
4530+
"source": [
4531+
"np.arange(10).shape"
4532+
]
4533+
},
4534+
{
4535+
"cell_type": "code",
4536+
"execution_count": 58,
4537+
"id": "bc2d50fd-39e0-48fd-8088-31dd9c42bbb8",
4538+
"metadata": {},
4539+
"outputs": [
4540+
{
4541+
"data": {
4542+
"text/plain": [
4543+
"numpy.random._generator.Generator"
4544+
]
4545+
},
4546+
"execution_count": 58,
4547+
"metadata": {},
4548+
"output_type": "execute_result"
4549+
}
4550+
],
4551+
"source": [
4552+
"type(rng)"
4553+
]
4554+
},
4555+
{
4556+
"cell_type": "code",
4557+
"execution_count": null,
4558+
"id": "2a474f22-7871-42cd-8459-ea7197b1284a",
4559+
"metadata": {},
43214560
"outputs": [],
43224561
"source": []
43234562
}

simGL/simGL.py

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,43 @@ def refalt_int_encoding(gm, ref, alt):
5555
refalt_int[refalt_str == "T"] = 3
5656
return refalt_int[gm.reshape(-1), np.repeat(np.arange(gm.shape[0]), gm.shape[1])].reshape(gm.shape)
5757

58-
def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = None, ref = None, alt = None):
58+
def linked_depth(rng, DPh, read_length, sites_n):
59+
'''
60+
Simulates reads in a contiguous genomic region to compute the depth per position.
61+
62+
Parameters
63+
----------
64+
rng : `numpy.random._generator.Generator`
65+
random number generation numpy object
66+
DPh : `numpy.ndarray`
67+
Numpy array with the depth per haplotype
68+
read_length : `int`
69+
Read length in base pair units
70+
sites_n : `int`
71+
number of sites that depth has to be simulated for
72+
73+
Returns
74+
-------
75+
DP : `numpy.ndarray`
76+
Depth per site per haplotype
77+
'''
78+
DP = []
79+
read_n = ((DPh*sites_n)/read_length).astype("int")
80+
for r in read_n:
81+
dp = np.zeros((sites_n,), dtype=int)
82+
for p in rng.integers(low=0, high=sites_n-read_length+1, size=r):
83+
dp[p:p+read_length] += 1
84+
DP.append(dp.tolist())
85+
return np.array(DP)
86+
87+
def independent_depth(rng, DPh, size):
88+
'''
89+
Returns depth per position per haplotype (size[0], size[1]) drawn from the "rng" from a Poisson
90+
distribution with a lambda value "DPh" per haplotype
91+
'''
92+
return rng.poisson(DPh, size=size)
93+
94+
def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = None, ref = None, alt = None, read_length = None, depth_type = "independent"):
5995
'''
6096
Simulates allele read counts from a genotype matrix.
6197
@@ -117,14 +153,18 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non
117153
if ref is None and alt is None:
118154
ref = np.full(gm.shape[0], "A")
119155
alt = np.full(gm.shape[0], "C")
120-
assert check_mean_depth(gm, mean_depth) and check_std_depth(mean_depth, std_depth) and check_e(e) and check_ploidy(ploidy) and check_gm_ploidy(gm, ploidy) and check_ref_alt(gm, ref, alt)
156+
assert check_mean_depth(gm, mean_depth) and check_std_depth(mean_depth, std_depth) and check_e(e) and check_ploidy(ploidy) and check_gm_ploidy(gm, ploidy) and check_ref_alt(gm, ref, alt) and check_depth_type(depth_type)
121157
#Variables
122158
err = np.array([[1-e, e/3, e/3, e/3], [e/3, 1-e, e/3, e/3], [e/3, e/3, 1-e, e/3], [e/3, e/3, e/3, 1-e]])
123159
rng = np.random.default_rng(seed)
124160
#1. Depths (DP) per haplotype (h)
125161
DPh = depth_per_haplotype(rng, mean_depth, std_depth, gm.shape[1], ploidy)
126162
#2. Sample depths (DP) per site per haplotype
127-
DP = rng.poisson(DPh, size=gm.shape)
163+
if depth_type == "independent":
164+
DP = independent_depth(rng, DPh, gm.shape)
165+
elif depth_type == "linked":
166+
assert check_positive_nonzero_integer(read_length, "read_length")
167+
DP = linked_depth(rng, DPh, read_length, gm.shape[0])
128168
#3. Sample correct and error reads per SNP per haplotype (Rh)
129169
#3.1. Convert anc = 0/der = 1 encoded gm into "A" = 0, "C" = 1, "G" = 3, "T" = 4 basepair (bp) encoded gm
130170
gmbp = refalt_int_encoding(gm, ref, alt)
@@ -289,7 +329,17 @@ def check_gm_ploidy(gm, ploidy):
289329
if not (gm.shape[1]%ploidy == 0) :
290330
raise TypeError('Incorrect ploidy and/or gm format: the second dimention of gm (haplotypic samples) must be divisible by ploidy')
291331
return True
292-
332+
333+
def check_depth_type(depth_type):
334+
if not isinstance(depth_type, str) and depth_type not in ["independent", "linked"]:
335+
raise TypeError('Incorrect depth_type format: it has to be a string, either "independent" or "linked"')
336+
return True
337+
338+
def check_positive_nonzero_integer(read_length, name):
339+
if not isinstance(read_length, int) and read_length <= 0:
340+
raise TypeError('Incorrect {} format: it has to be a integer value > 0'.format(name))
341+
return True
342+
293343
def check_ref_alt(gm, ref, alt):
294344
if not (isinstance(ref, np.ndarray) and isinstance(alt, np.ndarray) and len(ref.shape) == 1 and len(alt.shape) == 1 and ref.shape == alt.shape and ref.size == gm.shape[0] and
295345
((ref == "A") + (ref == "C") + (ref == "G") + (ref == "T")).sum() == ref.size and ((alt == "A") + (alt == "C") + (alt == "G") + (alt == "T")).sum() == alt.size):

0 commit comments

Comments
 (0)