@@ -8698,52 +8698,6 @@ def _parse_stat_matrix_sample_sets(ids):
86988698 sizes = np .array (sizes , dtype = size_dtype )
86998699 return flat , sizes
87008700
8701- # def divergence_matrix(self, sample_sets, windows=None, mode="site"):
8702- # """
8703- # Finds the mean divergence between pairs of samples from each set of
8704- # samples and in each window. Returns a numpy array indexed by (window,
8705- # sample_set, sample_set). Diagonal entries are corrected so that the
8706- # value gives the mean divergence for *distinct* samples, but it is not
8707- # checked whether the sample_sets are disjoint (so offdiagonals are not
8708- # corrected). For this reason, if an element of `sample_sets` has only
8709- # one element, the corresponding diagonal will be NaN.
8710-
8711- # The mean divergence between two samples is defined to be the mean: (as
8712- # a TreeStat) length of all edges separating them in the tree, or (as a
8713- # SiteStat) density of segregating sites, at a uniformly chosen position
8714- # on the genome.
8715-
8716- # :param list sample_sets: A list of sets of IDs of samples.
8717- # :param iterable windows: The breakpoints of the windows (including start
8718- # and end, so has one more entry than number of windows).
8719- # :return: A list of the upper triangle of mean TMRCA values in row-major
8720- # order, including the diagonal.
8721- # """
8722- # ns = len(sample_sets)
8723- # indexes = [(i, j) for i in range(ns) for j in range(i, ns)]
8724- # x = self.divergence(sample_sets, indexes, windows, mode=mode)
8725- # nw = len(windows) - 1
8726- # A = np.ones((nw, ns, ns), dtype=float)
8727- # for w in range(nw):
8728- # k = 0
8729- # for i in range(ns):
8730- # for j in range(i, ns):
8731- # A[w, i, j] = A[w, j, i] = x[w][k]
8732- # k += 1
8733- # return A
8734- # NOTE: see older definition of divmat here, which may be useful when documenting
8735- # this function. See https://github.com/tskit-dev/tskit/issues/2781
8736-
8737- # NOTE for documentation of sample_sets. We *must* use samples currently because
8738- # the normalisation for non-sample nodes is tricky. Do we normalise by the
8739- # total span of the ts where the node is 'present' in the tree? We avoid this
8740- # by insisting on sample nodes.
8741-
8742- # NOTE for documentation of num_threads. Need to explain that the
8743- # its best to think of as the number of background *worker* threads.
8744- # default is to run without any worker threads. If you want to run
8745- # with all the cores on the machine, use num_threads=os.cpu_count().
8746-
87478701 def divergence_matrix (
87488702 self ,
87498703 sample_sets = None ,
@@ -8753,6 +8707,41 @@ def divergence_matrix(
87538707 mode = None ,
87548708 span_normalise = True ,
87558709 ):
8710+ """
8711+ Finds the matrix of pairwise :meth:`.divergence` values between groups
8712+ of sample nodes. Returns a numpy array indexed by (window,
8713+ sample_set, sample_set): the [k,i,j]th value of the result gives the
8714+ mean divergence between pairs of samples from the i-th and j-th
8715+ sample sets in the k-th window. As for :meth:`.divergence`,
8716+ diagonal entries are corrected so that the
8717+ value gives the mean divergence for *distinct* samples,
8718+ and so diagonal entries are given by the :meth:`.diversity` of that
8719+ sample set. For this reason, if an element of `sample_sets` has only
8720+ one element, the corresponding :meth:`.diversity` will be NaN.
8721+ However, this method will place a value of 0 in the diagonal instead of NaN
8722+ in such cases; otherwise, this is equivalent to computing values with
8723+ `meth`:.divergence`.
8724+ However, this is (usually) more efficient than computing many
8725+ pairwise values using the `indexes` argument to :meth:`.divergence`,
8726+ so see :meth:`.divergence` for a description of what exactly is computed.
8727+
8728+ :param list sample_sets: A list of sets of IDs of samples.
8729+ :param list windows: The breakpoints of the windows (including start
8730+ and end, so has one more entry than number of windows).
8731+ :param str mode: A string giving the "type" of the statistic to be computed
8732+ (defaults to "site"; the other option is "branch").
8733+ :return: An array indexed by (window, sample_set, sample_set), or if windows is
8734+ `None`, an array indexed by (sample_set, sample_set).
8735+ """
8736+ # NOTE for documentation of sample_sets. We *must* use samples currently because
8737+ # the normalisation for non-sample nodes is tricky. Do we normalise by the
8738+ # total span of the ts where the node is 'present' in the tree? We avoid this
8739+ # by insisting on sample nodes.
8740+
8741+ # NOTE for documentation of num_threads. Need to explain that the
8742+ # its best to think of as the number of background *worker* threads.
8743+ # default is to run without any worker threads. If you want to run
8744+ # with all the cores on the machine, use num_threads=os.cpu_count().
87568745 windows_specified = windows is not None
87578746 windows = self .parse_windows (windows )
87588747 mode = "site" if mode is None else mode
@@ -8960,37 +8949,52 @@ def genetic_relatedness_matrix(
89608949 """
89618950 Computes the full matrix of pairwise genetic relatedness values
89628951 between (and within) pairs of sets of nodes from ``sample_sets``.
8963- *Warning:* this does not compute exactly the same thing as
8952+ Returns a numpy array indexed by (window, sample_set, sample_set):
8953+ the [k,i,j]th value of the result gives the
8954+ genetic relatedness between pairs of samples from the i-th and j-th
8955+ sample sets in the k-th window.
8956+ This is (usually) more efficient than computing many pairwise
8957+ values using the `indexes` argument to :meth:`.genetic_relatedness`.
8958+ Specifically, this computes :meth:`.genetic_relatedness` with
8959+ ``centre=True`` and ``proportion=False`` (with caveats, see below).
8960+
8961+ *Warning:* in some cases, this does not compute exactly the same thing as
89648962 :meth:`.genetic_relatedness`: see below for more details.
89658963
89668964 If `mode="branch"`, then the value obtained is the same as that from
89678965 :meth:`.genetic_relatedness`, using the options `centre=True` and
89688966 `proportion=False`. The same is true if `mode="site"` and all sites have
89698967 at most one mutation.
89708968
8971- However, if some sites have more than one mutation, the value may differ.
8969+ However, if some sites have more than one mutation, the value may differ
8970+ from that given by :meth:`.genetic_relatedness`:, although if the proportion
8971+ of such sites is small, the difference will be small.
89728972 The reason is that this function (for efficiency) computes relatedness
8973- using :meth:`.divergence ` and the following relationship.
8973+ using :meth:`.divergence_matrix ` and the following relationship.
89748974 "Relatedness" measures the number of *shared* alleles (or branches),
89758975 while "divergence" measures the number of *non-shared* alleles (or branches).
89768976 Let :math:`T_i` be the total distance from sample :math:`i` up to the root;
8977- then if :math:`D_{ij}` is the divergence between :math:`i` and :math:`j`
8978- and :math:`R_{ij}` is the relatedness between :math:`i` and :math:`j`, then
8979- :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
8977+ then if :math:`D_{ij}` is the branch-mode divergence between :math:`i` and
8978+ :math:`j` and :math:`R_{ij}` is the branch-mode relatedness between :math:`i`
8979+ and :math:`j`, then :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
89808980 So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T`
89818981 (that may now be random choices),
89828982 :math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`.
8983- Note, however, that this relationship only holds for `mode="site"`
8984- if we can treat "number of differing alleles" as distances on the tree;
8985- this is not necessarily the case in the presence of multiple mutations.
8983+ This is exactly what we want for (centered) relatedness.
8984+ However, this relationship does not necessarily hold for `mode="site"`:
8985+ it does hold if we can treat "number of differing alleles" as distances
8986+ on the tree, but this is not necessarily the case in the presence of
8987+ multiple mutations.
89868988
8987- Another caveat in the above relationship between :math:`R` and :math:`D`
8989+ Another note regarding the above relationship between :math:`R` and :math:`D`
89888990 is that :meth:`.divergence` of a sample set to itself does not include
89898991 the "self" comparisons (so as to provide an unbiased estimator of a
89908992 population quantity), while the usual definition of genetic relatedness
89918993 *does* include such comparisons (to provide, for instance, an appropriate
89928994 value for prospective results beginning with only a given set of
8993- individuals).
8995+ individuals). So, diagonal entries in the relatedness matrix returned here
8996+ are obtained from :meth:`divergence_matrix` after first correcting
8997+ diagonals to include these "self" comparisons.
89948998
89958999 :param list sample_sets: A list of lists of Node IDs, specifying the
89969000 groups of nodes to compute the statistic with.
@@ -8999,11 +9003,35 @@ def genetic_relatedness_matrix(
89999003 :param str mode: A string giving the "type" of the statistic to be computed
90009004 (defaults to "site").
90019005 :param bool span_normalise: Whether to divide the result by the span of the
9002- window (defaults to True). Has no effect if ``proportion`` is True.
9003- :return: A ndarray with shape equal to (num windows, num statistics).
9004- If there is one pair of sample sets and windows=None, a numpy scalar is
9005- returned.
9006- """
9006+ window (defaults to True).
9007+ :return: An array indexed by (window, sample_set, sample_set), or if windows is
9008+ `None`, an array indexed by (sample_set, sample_set).
9009+ """
9010+ # Further notes on the relationship between relatedness (R)
9011+ # and divergence (D) in mode="site":
9012+ # The summary function for divergence is "p (1-q)",
9013+ # where p and q are the allele frequencies in the two sample sets;
9014+ # while for relatedness it is "pq". Summing across *all* alleles,
9015+ # we get that relatedness plus divergence is
9016+ # p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 .
9017+ # This implies that
9018+ # ts.divergence(..., span_normalise=False)
9019+ # + ts.genetic_relatedness(..., span_normalise=False, centre=False,
9020+ # proportion=False, polarised=False)
9021+ # == ts.num_sites
9022+ # This could be the basis for a similar relationship between R and D.
9023+ # However, that relationship holds only with polarised=False, which is not
9024+ # the default, or what this function does (for good reason).
9025+ # So, without setting polarised=False, we have that that for samples i and j,
9026+ # divergence plus relatedness is equal to (something like)
9027+ # the total number of sites at which both i and j are ancestral;
9028+ # this depends on the samples and so does not cancel out of the centred
9029+ # version. We could work through these relationships to figure out what exactly
9030+ # the difference between genetic_relatedness_matrix(mode="site") and
9031+ # genetic_relatedness(mode="site") is, in the general case of multiple
9032+ # mutations... but that would be confusing, probably not that useful,
9033+ # and the short version of all this is that "it's complicated".
9034+
90079035 D = self .divergence_matrix (
90089036 sample_sets ,
90099037 windows = windows ,
0 commit comments