Load tree sequence from text

szhan · szhan · commit f8e42535b437 · 2023-08-07T19:14:25.000+01:00
diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py
@@ -2,64 +2,94 @@
 
 import pandas as pd
 
+import tskit
+
 
 """
 A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations
 (one per site). The first 2 individuals are used as reference panel,
 the last one is the target individual.
 """
-_toy_ts_text = """
-    left    right   parent  child   metadata
-    0.000000        1000000.000000  6       0
-    0.000000        1000000.000000  6       3
-    0.000000        1000000.000000  7       2
-    0.000000        1000000.000000  7       5
-    0.000000        1000000.000000  8       1
-    0.000000        1000000.000000  8       4
-    0.000000        781157.000000   9       6
-    0.000000        781157.000000   9       7
-    0.000000        505438.000000   10      8
-    0.000000        505438.000000   10      9
-    505438.000000   549484.000000   11      8
-    505438.000000   549484.000000   11      9
-    781157.000000   1000000.000000  12      6
-    781157.000000   1000000.000000  12      7
-    549484.000000   1000000.000000  13      8
-    549484.000000   781157.000000   13      9
-    781157.000000   1000000.000000  13      12
-    id      flags   location        parents metadata
-    0       0
-    1       0
-    2       0
-    site    node    time    derived_state   parent  metadata
-    0       9       unknown G       -1
-    1       8       unknown A       -1
-    2       9       unknown T       -1
-    3       9       unknown C       -1
-    4       12      unknown C       -1
-    id      is_sample       time    population      individual      metadata
-    0       1       0.000000        0       0
-    1       1       0.000000        0       0
-    2       1       0.000000        0       1
-    3       1       0.000000        0       1
-    4       1       0.000000        0       2
-    5       1       0.000000        0       2
-    6       0       0.029768        0       -1
-    7       0       0.133017        0       -1
-    8       0       0.223233        0       -1
-    9       0       0.651586        0       -1
-    10      0       0.698831        0       -1
-    11      0       2.114867        0       -1
-    12      0       4.322031        0       -1
-    13      0       7.432311        0       -1
-    position        ancestral_state metadata
-    200000.000000   A
-    300000.000000   C
-    520000.000000   G
-    600000.000000   T
-    900000.000000   A
+_toy_ts_nodes_text = """\
+id      is_sample       time    population      individual      metadata
+0       1       0.000000        0       0
+1       1       0.000000        0       0
+2       1       0.000000        0       1
+3       1       0.000000        0       1
+4       1       0.000000        0       2
+5       1       0.000000        0       2
+6       0       0.029768        0       -1
+7       0       0.133017        0       -1
+8       0       0.223233        0       -1
+9       0       0.651586        0       -1
+10      0       0.698831        0       -1
+11      0       2.114867        0       -1
+12      0       4.322031        0       -1
+13      0       7.432311        0       -1
+"""
+
+_toy_ts_edges_text = """\
+left    right   parent  child   metadata
+0.000000        1000000.000000  6       0
+0.000000        1000000.000000  6       3
+0.000000        1000000.000000  7       2
+0.000000        1000000.000000  7       5
+0.000000        1000000.000000  8       1
+0.000000        1000000.000000  8       4
+0.000000        781157.000000   9       6
+0.000000        781157.000000   9       7
+0.000000        505438.000000   10      8
+0.000000        505438.000000   10      9
+505438.000000   549484.000000   11      8
+505438.000000   549484.000000   11      9
+781157.000000   1000000.000000  12      6
+781157.000000   1000000.000000  12      7
+549484.000000   1000000.000000  13      8
+549484.000000   781157.000000   13      9
+781157.000000   1000000.000000  13      12
+"""
+
+_toy_ts_sites_text = """\
+position        ancestral_state metadata
+200000.000000   A
+300000.000000   C
+520000.000000   G
+600000.000000   T
+900000.000000   A
+"""
+
+_toy_ts_mutations_text = """\
+site    node    time    derived_state   parent  metadata
+0       9       unknown G       -1
+1       8       unknown A       -1
+2       9       unknown T       -1
+3       9       unknown C       -1
+4       12      unknown C       -1
+"""
+
+_toy_ts_individuals_text = """\
+flags
+0
+0
+0
 """
 
+
+def get_toy_ts():
+    """
+    Returns the toy tree sequence loaded from the text tables defined above.
+    """
+    ts = tskit.load_text(
+        nodes=StringIO(_toy_ts_nodes_text),
+        edges=StringIO(_toy_ts_edges_text),
+        sites=StringIO(_toy_ts_sites_text),
+        mutations=StringIO(_toy_ts_mutations_text),
+        individuals=StringIO(_toy_ts_individuals_text),
+        strict=False,
+    )
+    return ts
+
+
 """
 BEAGLE 4.1 was run on the toy data set above using default parameters.
 The following are the forward probability matrices and backward probability
@@ -141,7 +171,7 @@
 
 def convert_to_pd_df(matrix_text):
     """
-    Convert a matrix in text to a Pandas dataframe.
+    Converts a matrix in text to a Pandas dataframe and returns it.
     """
     df = pd.read_csv(StringIO(matrix_text))
     # Check that switch and non-switch probabilities sum to 1