Add rightmost column in backward matrix, update conversion function, and add calculation notes

szhan · szhan · commit 7564875d25e5 · 2023-08-07T22:41:11.000+01:00
diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py
@@ -1,5 +1,6 @@
 from io import StringIO
 
+import numpy as np
 import pandas as pd
 
 import tskit
@@ -93,8 +94,27 @@ def get_toy_ts():
 """
 BEAGLE 4.1 was run on the toy data set above using default parameters.
 The following are the forward probability matrices and backward probability
-matrices calculated when imputing into the third individual above.
-Note that there are two sets of matrices, one for each haplotype.
+matrices calculated when imputing into the third individual above. There are
+two sets of matrices, one for each haplotype.
+
+Notes about calculations:
+n = number of haplotypes in ref. panel
+M = number of markers
+m = index of marker (site)
+h = index of haplotype in ref. panel
+
+In forward probability matrix,
+    fwd[m][h] = emission prob., if m = 0 (first marker)
+    fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise
+    where scale = (1 - switch prob.)/sum of fwd[m - 1],
+        and shift = switch prob./n.
+
+In backward probability matrix,
+    bwd[m][h] = 1, if m = M - 1 (last marker)   [NOTE: not seen in BEAGLE]
+    unadj. bwd[m][h] = emission prob. / n
+    bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise
+    where scale = (1 - switch prob.)/sum of unadj. bwd[m],
+        and shift = switch prob./n.
 """
 _fwd_matrix_text_1 = """
 m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val
@@ -118,6 +138,10 @@ def get_toy_ts():
 
 _bwd_matrix_text_1 = """
 m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val
+3,0,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
+3,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
+3,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
+3,3,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
 2,0,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000
 2,1,1.000000,0.000000,0.999900,0.000100,0,0,0.000000,0.250000,0.250050,0.250000
 2,2,1.000000,0.000000,0.999900,0.000100,1,0,0.000000,0.250000,0.250050,0.250000
@@ -154,6 +178,10 @@ def get_toy_ts():
 
 _bwd_matrix_text_2 = """
 m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val
+3,0,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
+3,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
+3,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
+3,3,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.000000
 2,0,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000
 2,1,1.000000,0.000000,0.999900,0.000100,0,1,0.000000,0.250000,0.749950,0.250000
 2,2,1.000000,0.000000,0.999900,0.000100,1,1,0.000000,0.250000,0.749950,0.250000
@@ -174,10 +202,11 @@ def convert_to_pd_df(matrix_text):
     Converts a matrix in text to a Pandas dataframe and returns it.
     """
     df = pd.read_csv(StringIO(matrix_text))
-    # Check that switch and non-switch probabilities sum to 1
-    assert all(df.probRec + df.probNoRec == 1)
-    # Check that non-mismatch and mismatch probabilities sum to 1
-    assert all(df.noErrProb + df.errProb == 1)
+    for i in np.arange(df.shape[0]):
+        # Check that switch and non-switch probabilities sum to 1
+        assert df.probRec[i] + df.probNoRec[i] == 1 or np.isnan(df.probRec[i])
+        # Check that non-mismatch and mismatch probabilities sum to 1
+        assert df.noErrProb[i] + df.errProb[i] == 1 or np.isnan(df.noErrProb[i])
     matrix = df.val.to_numpy().reshape(
         (
             4,