Skip to content

Commit 1374138

Browse files
Fix high memory usage bug with chunked HK relationship #1333
* Fix high memory usage bug with chunked HK relationship #1333 * Update changelog
1 parent af2b623 commit 1374138

File tree

3 files changed

+36
-4
lines changed

3 files changed

+36
-4
lines changed

docs/changelog.rst

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,12 @@ Deprecations
2828
.. Improvements
2929
.. ~~~~~~~~~~~~
3030
31-
.. Bug fixes
32-
.. ~~~~~~~~~
31+
Bug fixes
32+
~~~~~~~~~
33+
34+
- Fixed bug causing high memory usage in :func:`pedigree_kinship` when
35+
evaluating a chunked relationship matrix with the Hamilton-Kerr method.
36+
(:user:`timothymillar`, :pr:`1335`, :issue:`1333`)
3337

3438
.. Documentation
3539
.. ~~~~~~~~~~~~~

sgkit/stats/pedigree.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,10 +1198,15 @@ def pedigree_kinship(
11981198
# new dataset
11991199
dims = ["samples_0", "samples_1"]
12001200
if return_relationship:
1201-
relationship = kinship * 2
12021201
if method == EST_HAMILTON_KERR:
12031202
ploidy = tau.sum(axis=-1)
1204-
relationship *= np.sqrt(ploidy[None, :] / 2 * ploidy[:, None] / 2)
1203+
ploidy_0 = ploidy.rechunk((kinship.chunks[0],))
1204+
ploidy_1 = ploidy.rechunk((kinship.chunks[1],))
1205+
scale = da.sqrt(ploidy_0[:, None] * ploidy_1[None, :])
1206+
assert kinship.chunks == scale.chunks
1207+
relationship = kinship * scale
1208+
else:
1209+
relationship = kinship * 2
12051210
arrays = {
12061211
variables.stat_pedigree_kinship: xr.DataArray(kinship, dims=dims),
12071212
variables.stat_pedigree_relationship: xr.DataArray(relationship, dims=dims),

sgkit/tests/test_pedigree.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,29 @@ def test_pedigree_kinship__Hamilton_Kerr_compress_parent_dimension(
560560
np.testing.assert_array_almost_equal(actual, expect)
561561

562562

563+
def test_pedigree_kinship__Hamilton_Kerr_chunked_mem():
564+
# tests fix for https://github.com/sgkit-dev/sgkit/issues/1333
565+
# if chunking is not applied to the ploidy matrix correctly
566+
# this will try to allocate > 7TB of RAM (and presumably fail)
567+
ds = xr.Dataset()
568+
ds["parent"] = ["samples", "parents"], np.full((1_000_000, 2), -1, int)
569+
ds["stat_Hamilton_Kerr_tau"] = ["samples", "parents"], np.full(
570+
(1_000_000, 2), 2, np.uint64
571+
)
572+
ds["stat_Hamilton_Kerr_lambda"] = ["samples", "parents"], np.full(
573+
(1_000_000, 2), 0.0
574+
)
575+
matrix = sg.pedigree_kinship(
576+
ds,
577+
method="Hamilton-Kerr",
578+
chunks=((10, 999990), (20, 999980)),
579+
return_relationship=True,
580+
).stat_pedigree_relationship
581+
expect = np.eye(10, 20)
582+
actual = matrix[0:10, 0:20].compute()
583+
np.testing.assert_array_almost_equal(actual, expect)
584+
585+
563586
@pytest.mark.parametrize("method", ["diploid", "Hamilton-Kerr"])
564587
def test_pedigree_kinship__half_founder(method):
565588
ds0 = sg.simulate_genotype_call_dataset(n_variant=1, n_sample=6)

0 commit comments

Comments
 (0)