From dcfc110c256e514562f1eadf5ac9df9ef7f7a3a5 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 4 Sep 2024 16:14:23 +0100 Subject: [PATCH 1/9] Improve test_count_variant_genotypes__biallelic --- sgkit/tests/test_aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py index 5ed538d03..c154591b4 100644 --- a/sgkit/tests/test_aggregation.py +++ b/sgkit/tests/test_aggregation.py @@ -400,11 +400,11 @@ def count_biallelic_genotypes(calls, ploidy): calls = ds.call_genotype.values expect = count_biallelic_genotypes(calls, ploidy) if chunked: - # chunk each dim + # chunk each dim except ploidy chunks = ( (n_variant // 2, n_variant - n_variant // 2), (n_sample // 2, n_sample - n_sample // 2), - (ploidy // 2, ploidy - ploidy // 2), + -1, ) ds["call_genotype"] = ds["call_genotype"].chunk(chunks) actual = count_variant_genotypes(ds)["variant_genotype_count"].data From f6a0ed750cb242ac134bab51f37bd9edb0c7b6b9 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 26 Sep 2023 14:53:15 +0100 Subject: [PATCH 2/9] popgen progress --- sgkit/stats/aggregation.py | 6 ++---- sgkit/stats/cohort_numba_fns.py | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index 9f3862985..2cb5d0ef9 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -273,10 +273,8 @@ def count_cohort_alleles( ds, variables.call_allele_count, call_allele_count, count_call_alleles ) variables.validate(ds, {call_allele_count: variables.call_allele_count_spec}) - # ensure cohorts is a numpy array to minimize dask task - # dependencies between chunks in other dimensions - AC, SC = da.asarray(ds[call_allele_count]), ds[sample_cohort].values - n_cohorts = SC.max() + 1 # 0-based indexing + AC, SC = da.asarray(ds[call_allele_count]), da.asarray(ds[sample_cohort]) + n_cohorts = ds[sample_cohort].values.max() + 1 # 0-based indexing AC = cohort_sum(AC, SC, n_cohorts, axis=1) new_ds = create_dataset( {variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)} diff --git a/sgkit/stats/cohort_numba_fns.py b/sgkit/stats/cohort_numba_fns.py index 91aa54dd7..8fcf88296 100644 --- a/sgkit/stats/cohort_numba_fns.py +++ b/sgkit/stats/cohort_numba_fns.py @@ -43,19 +43,19 @@ def cohort_reduction(gufunc: Callable) -> Callable: @wraps(gufunc) def func(x: ArrayLike, cohort: ArrayLike, n: int, axis: int = -1) -> ArrayLike: - x = da.swapaxes(da.asarray(x), axis, -1) + x = da.moveaxis(da.asarray(x), [axis, -1], [-1, axis]) replaced = len(x.shape) - 1 chunks = x.chunks[0:-1] + (n,) out = da.map_blocks( gufunc, x, cohort, - np.empty(n, np.int8), + da.empty(n, dtype=np.int8), chunks=chunks, drop_axis=replaced, new_axis=replaced, ) - return da.swapaxes(out, axis, -1) + return da.moveaxis(out, [axis, -1], [-1, axis]) return func From 345de78ecab951e044ab49f32757f9cb5d8734f5 Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 6 Dec 2024 17:12:32 +0000 Subject: [PATCH 3/9] cohort_reduction dtype --- sgkit/stats/cohort_numba_fns.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sgkit/stats/cohort_numba_fns.py b/sgkit/stats/cohort_numba_fns.py index 8fcf88296..796f1190f 100644 --- a/sgkit/stats/cohort_numba_fns.py +++ b/sgkit/stats/cohort_numba_fns.py @@ -51,6 +51,7 @@ def func(x: ArrayLike, cohort: ArrayLike, n: int, axis: int = -1) -> ArrayLike: x, cohort, da.empty(n, dtype=np.int8), + dtype=x.dtype, # TODO: is this right? Might need to be passed in. chunks=chunks, drop_axis=replaced, new_axis=replaced, From 7d10ae4845cee5554f1e5c3aa20459304f2d872f Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 26 Sep 2023 15:37:00 +0100 Subject: [PATCH 4/9] change sum from method to function --- sgkit/stats/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index 2cb5d0ef9..081e73d34 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -1058,7 +1058,7 @@ def individual_heterozygosity( variables.validate(ds, {call_allele_count: variables.call_allele_count_spec}) AC = da.asarray(ds.call_allele_count) - K = AC.sum(axis=-1) + K = da.sum(AC, axis=-1) # use nan denominator to avoid divide by zero with K - 1 K2 = da.where(K > 1, K, np.nan) AF = AC / K2[..., None] From 0fc8eb6a2253ac1dad574f733ff631a3f3563720 Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 6 Dec 2024 17:35:46 +0000 Subject: [PATCH 5/9] Fix diversity --- sgkit/stats/popgen.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index e201dfc98..f16e2152f 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -92,12 +92,12 @@ def diversity( ac = ds[cohort_allele_count] an = ac.sum(axis=2) - n_pairs = an * (an - 1) / 2 - n_same = (ac * (ac - 1) / 2).sum(axis=2) + n_pairs = an * (an - 1) // 2 + n_same = (ac * (ac - 1) // 2).sum(axis=2) n_diff = n_pairs - n_same # replace zeros to avoid divide by zero error n_pairs_na = n_pairs.where(n_pairs != 0) - pi = n_diff / n_pairs_na + pi = n_diff.astype(np.float64) / n_pairs_na.astype(np.float64) if has_windows(ds): div = window_statistic( From 9438d6a740dde546359760b6c90ea0c2fd5fb93e Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 6 Dec 2024 17:49:46 +0000 Subject: [PATCH 6/9] Fix window_statistic for cubed --- sgkit/window.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sgkit/window.py b/sgkit/window.py index 2f0ba5cae..d0a6c154f 100644 --- a/sgkit/window.py +++ b/sgkit/window.py @@ -536,10 +536,10 @@ def window_statistic( chunk_offsets = _sizes_to_start_offsets(windows_per_chunk) - def blockwise_moving_stat(x: ArrayLike, block_info: Any = None) -> ArrayLike: - if block_info is None or len(block_info) == 0: + def blockwise_moving_stat(x: ArrayLike, block_id: Any = None) -> ArrayLike: + if block_id is None: return np.array([]) - chunk_number = block_info[0]["chunk-location"][0] + chunk_number = block_id[0] chunk_offset_start = chunk_offsets[chunk_number] chunk_offset_stop = chunk_offsets[chunk_number + 1] chunk_window_starts = rel_window_starts[chunk_offset_start:chunk_offset_stop] @@ -559,8 +559,9 @@ def blockwise_moving_stat(x: ArrayLike, block_info: Any = None) -> ArrayLike: depth = {0: depth} # new chunks are same except in first axis new_chunks = tuple([tuple(windows_per_chunk)] + list(desired_chunks[1:])) # type: ignore - return values.map_overlap( + return da.map_overlap( blockwise_moving_stat, + values, dtype=dtype, chunks=new_chunks, depth=depth, From 830f870af15e2f45dd6efbe88606f6e499dc173f Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 18 Dec 2024 14:25:19 +0000 Subject: [PATCH 7/9] popgen and windowing --- sgkit/stats/popgen.py | 2 +- sgkit/tests/test_popgen.py | 2 +- sgkit/tests/test_window.py | 2 +- sgkit/window.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index f16e2152f..318fdd7fe 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -2,10 +2,10 @@ import itertools from typing import Hashable, Optional, Sequence, Tuple, Union -import dask.array as da import numpy as np from xarray import Dataset +import sgkit.distarray as da from sgkit.cohorts import _cohorts_to_array from sgkit.stats.utils import assert_array_shape from sgkit.utils import ( diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py index 251e0568b..3d2be338d 100644 --- a/sgkit/tests/test_popgen.py +++ b/sgkit/tests/test_popgen.py @@ -1,7 +1,6 @@ import itertools import allel -import dask.array as da import msprime # type: ignore import numpy as np import pytest @@ -11,6 +10,7 @@ from hypothesis import given, settings from hypothesis import strategies as st +import sgkit.distarray as da from sgkit import ( Fst, Garud_H, diff --git a/sgkit/tests/test_window.py b/sgkit/tests/test_window.py index 3c518f560..65b01168d 100644 --- a/sgkit/tests/test_window.py +++ b/sgkit/tests/test_window.py @@ -1,12 +1,12 @@ import re import allel -import dask.array as da import numpy as np import pandas as pd import pytest import xarray as xr +import sgkit.distarray as da from sgkit import ( simulate_genotype_call_dataset, window_by_interval, diff --git a/sgkit/window.py b/sgkit/window.py index d0a6c154f..7c8c64705 100644 --- a/sgkit/window.py +++ b/sgkit/window.py @@ -1,9 +1,9 @@ from typing import Any, Callable, Hashable, Iterable, Optional, Tuple, Union -import dask.array as da import numpy as np from xarray import Dataset +import sgkit.distarray as da from sgkit import variables from sgkit.model import get_contigs, num_contigs from sgkit.utils import conditional_merge_datasets, create_dataset From 65988dd9f7a46927ba0e9e63c8be09ca27499a47 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 18 Dec 2024 14:28:45 +0000 Subject: [PATCH 8/9] popgen and windowing --- sgkit/stats/cohort_numba_fns.py | 2 +- sgkit/tests/test_cohort_numba_fns.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sgkit/stats/cohort_numba_fns.py b/sgkit/stats/cohort_numba_fns.py index 796f1190f..a19f41808 100644 --- a/sgkit/stats/cohort_numba_fns.py +++ b/sgkit/stats/cohort_numba_fns.py @@ -1,9 +1,9 @@ from functools import wraps from typing import Callable -import dask.array as da import numpy as np +import sgkit.distarray as da from sgkit.accelerate import numba_guvectorize from ..typing import ArrayLike diff --git a/sgkit/tests/test_cohort_numba_fns.py b/sgkit/tests/test_cohort_numba_fns.py index 2239e3e56..3a1312581 100644 --- a/sgkit/tests/test_cohort_numba_fns.py +++ b/sgkit/tests/test_cohort_numba_fns.py @@ -1,7 +1,7 @@ -import dask.array as da import numpy as np import pytest +import sgkit.distarray as da from sgkit.stats.cohort_numba_fns import ( cohort_mean, cohort_nanmean, @@ -41,7 +41,7 @@ def _cohort_reduction(func, x, cohort, n, axis=-1): _random_cohort_data((20, 20), n=3, axis=-1, missing=0.3), _random_cohort_data((7, 103, 4), n=5, axis=1, scale=7, missing=0.3), _random_cohort_data( - ((3, 4), (50, 50, 3), 4), n=5, axis=1, scale=7, dtype=np.uint8 + ((4, 3), (50, 50, 3), 4), n=5, axis=1, scale=7, dtype=np.uint8 ), _random_cohort_data( ((6, 6), (50, 50, 7), (3, 1)), n=5, axis=1, scale=7, missing=0.3 From fff899d018d122c3f9143393d39b62c3eae97725 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 18 Dec 2024 14:56:59 +0000 Subject: [PATCH 9/9] Run popgen and windowing tests on Cubed --- .github/workflows/cubed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cubed.yml b/.github/workflows/cubed.yml index 0b3165fc4..0535cce81 100644 --- a/.github/workflows/cubed.yml +++ b/.github/workflows/cubed.yml @@ -30,4 +30,4 @@ jobs: - name: Test with pytest run: | - pytest -v sgkit/tests/test_{aggregation,association,hwe,pca}.py -k 'test_count_call_alleles or test_gwas_linear_regression or test_hwep or test_sample_stats or (test_count_variant_alleles and not test_count_variant_alleles__chunked[call_genotype]) or (test_variant_stats and not test_variant_stats__chunks[chunks2-False]) or (test_pca__array_backend and tsqr)' --use-cubed \ No newline at end of file + pytest -v sgkit/tests/test_{aggregation,association,hwe,pca}.py -k 'test_count_call_alleles or test_gwas_linear_regression or test_hwep or test_popgen or test_sample_stats or (test_count_variant_alleles and not test_count_variant_alleles__chunked[call_genotype]) or (test_variant_stats and not test_variant_stats__chunks[chunks2-False]) or (test_pca__array_backend and tsqr) or test_window' --use-cubed \ No newline at end of file