@@ -2142,3 +2142,154 @@ def pedigree_sel(
2142
2142
if drop_parent :
2143
2143
new_ds = new_ds .drop_vars (parent_var )
2144
2144
return new_ds
2145
def pedigree_contribution(
    ds: Dataset,
    *,
    method: Literal["even", "variable"] = "even",
    chunks: Hashable = -1,
    parent: Hashable = variables.parent,
    stat_Hamilton_Kerr_tau: Hashable = variables.stat_Hamilton_Kerr_tau,
    merge: bool = True,
) -> Dataset:
    """Calculate the expected genomic contribution of each sample to each
    other sample based on pedigree structure.

    Parameters
    ----------
    ds
        Dataset containing pedigree structure.
    method
        The method used for estimating genomic contributions.
        The 'even' method assumes that all samples are of a single,
        even ploidy (e.g., diploid) and have even contributions
        from each parent. The 'variable' method allows for un-even
        contributions due to ploidy manipulation and/or clonal
        reproduction.
    chunks
        Optionally specify chunks for the returned array. A single chunk
        is used by default. Currently, chunking is only supported for a
        single axis.
    parent
        Input variable name holding parents of each sample as defined by
        :data:`sgkit.variables.parent_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`parent_indices`.
    stat_Hamilton_Kerr_tau
        Input variable name holding stat_Hamilton_Kerr_tau as defined
        by :data:`sgkit.variables.stat_Hamilton_Kerr_tau_spec`.
        This variable is only required for the 'variable' method.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_pedigree_contribution_spec`.

    Raises
    ------
    ValueError
        If an unknown method is specified.
    ValueError
        If the 'even' method is specified for an odd-ploidy dataset.
    ValueError
        If the 'even' method is specified and the length of the 'parents'
        dimension is not 2.
    NotImplementedError
        If chunking is specified for both axes.

    Note
    ----
    Dimensions of :data:`sgkit.variables.stat_pedigree_contribution_spec`
    are named ``samples_0`` and ``samples_1``.

    Examples
    --------
    >>> ds = xr.Dataset()
    >>> ds["sample_id"] = "samples", ["S0", "S1", "S2", "S3", "S4", "S5"]
    >>> ds["parent_id"] = ["samples", "parents"], [
    ...     [ ".", "."],
    ...     [ ".", "."],
    ...     ["S1", "S0"],
    ...     ["S2", "S0"],
    ...     ["S0", "S2"],
    ...     ["S1", "S3"]
    ... ]
    >>> ds = pedigree_contribution(ds)
    >>> ds.stat_pedigree_contribution.values # doctest: +NORMALIZE_WHITESPACE
    array([[1.   , 0.   , 0.5  , 0.75 , 0.75 , 0.375],
           [0.   , 1.   , 0.5  , 0.25 , 0.25 , 0.625],
           [0.   , 0.   , 1.   , 0.5  , 0.5  , 0.25 ],
           [0.   , 0.   , 0.   , 1.   , 0.   , 0.5  ],
           [0.   , 0.   , 0.   , 0.   , 1.   , 0.   ],
           [0.   , 0.   , 0.   , 0.   , 0.   , 1.   ]])
    """
    # Deferred import so that numba compilation is only triggered when
    # this function is actually used.
    from .pedigree_numba_fns import contribution_from, contribution_to

    if method not in {"even", "variable"}:
        raise ValueError("Unknown method '{}'".format(method))
    ds = define_variable_if_absent(
        ds,
        variables.parent,
        parent,
        parent_indices,
    )
    variables.validate(ds, {parent: variables.parent_spec})
    # Pedigree traversal requires the full parent matrix in one chunk.
    parent = da.asarray(ds[parent].data, chunks=ds[parent].shape)
    n_sample, n_parent = parent.shape
    if method == "even":
        # A missing "ploidy" dimension implies the default diploid case.
        if bool(ds.dims.get("ploidy", 2) % 2):
            raise ValueError("The 'even' method requires an even-ploidy dataset")
        if n_parent != 2:
            raise ValueError(
                "The 'even' method requires that the 'parents' dimension has a length of 2"
            )
        # Each of the two parents contributes exactly half.
        parent_contribution = da.ones_like(parent, float) / 2
    else:
        variables.validate(
            ds, {stat_Hamilton_Kerr_tau: variables.stat_Hamilton_Kerr_tau_spec}
        )
        tau = da.asarray(
            ds[stat_Hamilton_Kerr_tau].data, chunks=ds[stat_Hamilton_Kerr_tau].shape
        )
        # Per-parent contribution proportional to gametic ploidy (tau).
        parent_contribution = tau / tau.sum(axis=-1, keepdims=True)
    # Order samples so that parents are always processed before offspring.
    order = da.gufunc(
        topological_argsort,
        signature="(n,p)->(n)",
        output_dtypes=np.uint64,
    )(parent)
    # create a dummy array for dask to work out chunks
    chunks_0, chunks_1 = da.empty((n_sample, n_sample), chunks=chunks).chunks
    if (len(chunks_0) > 1) and (len(chunks_1) > 1):
        raise NotImplementedError("Chunking is only supported along a single axis")
    elif len(chunks_0) == 1:
        targets = da.arange(n_sample, chunks=chunks_1)
        # transpose so that [x, y] is contribution of x to y
        # TODO: dask already adds a transpose when calling the gufunc so
        # avoiding that initial transpose somehow should be more efficient
        contribution = contribution_to(
            parent,
            parent_contribution,
            order,
            targets,
        ).T
    else:
        targets = da.arange(n_sample, chunks=chunks_0)
        contribution = contribution_from(
            parent,
            parent_contribution,
            order,
            targets,
        )
    new_ds = create_dataset(
        {
            variables.stat_pedigree_contribution: (
                ("samples_0", "samples_1"),
                contribution,
            )
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)
0 commit comments