tskit-dev · hyanwong · Feb 4, 2026 · Feb 6, 2026 · jeromekelleher · Feb 9, 2026
diff --git a/tests/test_variantdata.py b/tests/test_variantdata.py
@@ -1128,3 +1128,204 @@ def test_with_variant_data(self, tmp_path):
             else:
                 allele_idx = -1
             assert vdata.sites_ancestral_allele[i] == allele_idx
+
+
+class TestFromArrays:
+    def demo_data(self):
+        # returns pos, data, alleles, ancestral
+        return [
+            list(data)
+            for data in zip(
+                *[
+                    (3, [[0, 1], [0, 0], [0, 0]], ["A", "T", ""], "A"),
+                    (10, [[0, 1], [1, 1], [0, 0]], ["C", "A", ""], "C"),
+                    (13, [[0, 1], [1, 0], [0, 0]], ["G", "C", ""], "C"),
+                    (19, [[0, 0], [0, 1], [1, 0]], ["A", "C", ""], "A"),
+                    (20, [[0, 1], [2, 0], [0, 0]], ["T", "G", "C"], "T"),
+                ]
+            )
+        ]
+
+    def test_simple_from_arrays(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        vdata = tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)
+        assert vdata.num_individuals == 3
+        assert vdata.num_sites == 5
+        inf_ts = tsinfer.infer(vdata)
+        assert inf_ts.num_samples == 6
+        assert inf_ts.num_individuals == 3
+        assert inf_ts.num_sites == 5
+        assert np.all(inf_ts.sites_position == pos)
+
+    def test_named_from_arrays(self):
+        # When we pass sample_id names, they should be stored in the individuals metadata
+        pos, G, alleles, ancestral = self.demo_data()
+        sample_id = ["sample1", "sample2", "sample3"]
+        vdata = tsinfer.VariantData.from_arrays(
+            G, pos, alleles, ancestral, sample_id=sample_id
+        )
+        assert vdata.num_individuals == 3
+        inf_ts = tsinfer.infer(vdata)
+        assert inf_ts.num_individuals == 3
+        for name, ind in zip(sample_id, inf_ts.individuals()):
+            assert ind.metadata["variant_data_sample_id"] == name
+
+    def test_bad_variant_matrix(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        G = np.array(G)
+        with pytest.raises(ValueError, match="must be a 3D array"):
+            tsinfer.VariantData.from_arrays([G], pos, alleles, ancestral)
+        with pytest.raises(ValueError, match="must be a 3D array"):
+            tsinfer.VariantData.from_arrays(G[:, :, 0], pos, alleles, ancestral)
+
+    def test_empty(self):
+        # Test with ploidy=1 but no sites
+        pos, G, alleles, ancestral = [], np.empty((0, 0, 1)), np.empty((0, 0)), []
+        with pytest.raises(ValueError, match="No sites exist"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)
+
+    def test_zero_ploidy(self):
+        pos, G, alleles, ancestral = [], [[[]]], np.empty((0, 0)), []
+        with pytest.raises(ValueError, match="Ploidy must be greater than zero"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)
+
+    def test_from_arrays_ancestral_missing_warning(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        ancestral[0] = "-"
+        with pytest.warns(UserWarning, match=r"ancestral allele.+not found[\s\S]+'-'"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)
+
+    def test_sequence_length(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        vdata = tsinfer.VariantData.from_arrays(
+            G, pos, alleles, ancestral, sequence_length=50
+        )
+        assert vdata.sequence_length == 50
+
+    def test_bad_sequence_length(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        with pytest.raises(ValueError, match="`sequence_length` cannot be less"):
+            tsinfer.VariantData.from_arrays(
+                G, pos, alleles, ancestral, sequence_length=10
+            )
+
+    @pytest.mark.parametrize("pos", [[[3, 10, 13, 19, 20]], [3, 10, 13, 19]])
+    def test_bad_position(self, pos):
+        _, G, alleles, ancestral = self.demo_data()
+        with pytest.raises(ValueError, match="`variant_position` must be a 1D array"):
+            tsinfer.VariantData.from_arrays(G, [pos], alleles, ancestral)
+
+    def test_unordered_position(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        pos[-1] = 5  # out of order
+        with pytest.raises(ValueError, match="out-of-order values"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)
+
+    def test_bad_dim_alleles(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        with pytest.raises(ValueError, match="`variant_allele` must be a 2D array"):
+            tsinfer.VariantData.from_arrays(G, pos, [alleles], ancestral)
+
+    def test_bad_alleles(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        alleles = np.array(alleles)
+        with pytest.raises(ValueError, match="same number of rows as variants"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles[1:, :], ancestral)
+
+    def test_bad_num_alleles(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        alleles = np.array(alleles)
+        with pytest.raises(ValueError, match="same number of columns"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles[:, 1:], ancestral)
+
+    def test_bad_ancestral_state_length(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        ancestral = np.array(ancestral)
+        with pytest.raises(ValueError, match="`ancestral_state` must be a 1D array"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, [ancestral])
+        with pytest.raises(ValueError, match="`ancestral_state` must be a 1D array"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral[1:])
+
+    @pytest.mark.parametrize("sid", [["A"], []])
+    def test_bad_sample_id(self, sid):
+        pos, G, alleles, ancestral = self.demo_data()
+        print(sid)
+        with pytest.raises(ValueError, match="`sample_id` must be a 1D array"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral, sample_id=sid)
+
+    def test_sample_mask(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        G = np.array(G)
+        mask = np.array([False, False, True])
+        keep = np.logical_not(mask)
+        alleles = np.array(alleles)
+        vdata = tsinfer.VariantData.from_arrays(
+            G, pos, alleles, ancestral, sample_mask=mask
+        )
+        assert vdata.num_individuals == 2
+        inf_ts = tsinfer.infer(vdata)
+        assert inf_ts.num_individuals == 2
+        for v, p, allele_arr in zip(inf_ts.variants(), pos, alleles):
+            expected_idx = G[v.site.id, keep, :].flatten()
+            assert v.site.position == p
+            assert np.array_equal(v.states(), allele_arr[expected_idx])
+
+    def test_site_mask(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        G = np.array(G)
+        mask = np.array([False, False, True, False, False])
+        keep = np.logical_not(mask)
+        pos = np.array(pos)
+        alleles = np.array(alleles)
+        ancestral = np.array(ancestral)
+        vdata = tsinfer.VariantData.from_arrays(
+            G, pos, alleles, ancestral[keep], site_mask=mask
+        )
+        assert vdata.num_individuals == 3
+        inf_ts = tsinfer.infer(vdata)
+        used_sites = np.where(keep)[0]
+        for v, p, allele_arr in zip(inf_ts.variants(), pos[keep], alleles[keep]):
+            expected_idx = G[used_sites[v.site.id], :, :].flatten()
+            assert v.site.position == p
+            assert np.array_equal(v.states(), allele_arr[expected_idx])
+
+    def test_bad_site_mask_length(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        mask = np.array([False, True, False])  # wrong length
+        with pytest.raises(ValueError, match="length as the total number of variants"):
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral, site_mask=mask)
+
+    def test_bad_sample_mask_length(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        mask = np.array([False, True, True, False, True])  # wrong length
+        with pytest.raises(ValueError, match="length as the total number of samples"):
+            tsinfer.VariantData.from_arrays(
+                G, pos, alleles, ancestral, sample_mask=mask
+            )
+
+    def test_bad_ancestral_state_masked(self):
+        pos, G, alleles, ancestral = self.demo_data()
+        mask = np.array([False, False, True, False, False])
+        with pytest.raises(ValueError, match="`ancestral_state` must be a 1D array"):
+            # Need to provide ancestral states of the same length as *unmasked* sites
+            tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral, site_mask=mask)
+
+    def test_round_trip_ts(self):
+        ts = msprime.sim_ancestry(10, sequence_length=1000, random_seed=123)
+        ts = msprime.sim_mutations(ts, rate=1e-2, random_seed=123)
+        samples = ts.individuals_nodes
+        G = []
+        alleles = []
+        for v in ts.variants():
+            G.append(v.genotypes[samples])
+            alleles.append(v.alleles + ("",) * (4 - len(v.alleles)))  # pad to 4 alleles
+
+        vdata = tsinfer.VariantData.from_arrays(
+            G,
+            ts.sites_position,
+            alleles,
+            np.array(ts.sites_ancestral_state, dtype="U1"),
+        )
+        inf_ts = tsinfer.infer(vdata)
+        for v1, v2 in zip(inf_ts.variants(), ts.variants()):
+            assert np.array_equal(v1.states(), v2.states())
diff --git a/tsinfer/formats.py b/tsinfer/formats.py
@@ -2424,7 +2424,6 @@ def __init__(
         individuals_flags=None,
         sequence_length=None,
     ):
-        self._sequence_length = sequence_length
         self._contig_index = None
         self._contig_id = None
         try:
@@ -2494,6 +2493,9 @@ def process_array(
         self.individuals_select = ~sample_mask.astype(bool)
         self._num_sites = np.sum(self.sites_select)
 
+        if len(self.sites_select) == 0:
+            raise ValueError("No sites exist")
+
         if np.sum(self.sites_select) == 0:
             raise ValueError(
                 "All sites have been masked out, at least one value "
@@ -2647,6 +2649,157 @@ def process_array(
         logger.info(
             f"Number of individuals after applying mask: {self.num_individuals}"
         )
+        self._sequence_length = sequence_length
+        if self.sequence_length <= self.sites_position[-1]:
+            raise ValueError(
+                "`sequence_length` cannot be less than or equal to the maximum "
+                "unmasked variant position"
+            )
+
+    @classmethod
+    def from_arrays(
+        cls,
+        variant_matrix_phased,
+        variant_position,
+        variant_allele,
+        ancestral_state,
+        *,
+        sample_id=None,
+        site_mask=None,
+        sample_mask=None,
+        **kwargs,
+    ):
+        """
+        Create a basic in-memory VariantData instance directly from array data.
+        Mainly useful for small test datasets. Larger datasets, or ones that require
+        metadata to be included, should use e.g. bio2zarr
+        to create a zarr datastore containing the required data and call
+        VariantData(path_to_zarr)
+
+        .. note::
+            If a ``site_mask`` or ``sample_mask`` is provided, this does not
+            require changing the size of the ``variant_position``, ``variant_allele``
+            or `sample_id` arrays. Which must always match the dimensions of the
+            ``variant_matrix_phased`` array. However, if sites are masked out, this
+            *does* require changing the ``ancestral_state`` array to match the number of
+            unmasked sites (and similarly for other arrays passed as kwargs).
+
+        :param array variant_matrix_phased: a 3D array of variants X samples x ploidy,
+            giving an index into the allele array for each corresponding variant. Values
+            must be coercable into 8-bit (np.int8) integers. Data for all samples is
+            assumed to be phased. This corresponds to the ``call_genotype`` array in the
+            VCF Zarr specification (e.g. missing data can be coded as -1).
+        :param array variant_position: a 1D array of variant positions, of the same
+            length as the first dimension of ``variant_matrix_phased``.
+        :param array variant_allele: a 2D string array of variants x max_num_alleles at a
+            site. The length of the first dimension must match the first dimension of the
+            ``variant_matrix_phased`` array. The second dimension must be at least as
+            long as the maximum allele index in the `variant_matrix_phased` array (i.e.
+            each allele list for a variant must be the same length; this can be ensured
+            by padding the list with `""`).
+        :param array ancestral_state: A numpy array of strings specifying
+            the ancestral states (alleles) used in inference. For unknown ancestral
+            alleles, any character which is not in the allele list can be used.
+            This must be the same length as the number of *unmasked* variants in
+            ``variant_matrix_phased`` (see note above).
+        :param array sample_id: a 1D string array of sample names, of length of the
+            total number of n-ploid samples in ``variant_matrix_phased`` (i.e. the
+            second dimension of ``variant_matrix_phased``).
+            If None, each individual n-ploid sample will be
+            allocated an ID corresponding to its sample index in the *unmasked*
+            ``variant_matrix_phased`` array (i.e. "0", "1", "2", .. etc.)
+        :param array site_mask: A numpy array of booleans of length specifying which
+            sites to mask out (exclude) from the dataset.
+        :param array sample_mask: A numpy array of booleans of length specifying which
+            samples to mask out (exclude) from the dataset.
+        :param \\**kwargs: Further arguments passed to the VariantData constructor.
+            In particular you may wish to specify `sequence_length`. Arrays for
+            ``sites_time``, ``individuals_time`` etc. can also be provided.
+        """
+        call_genotype = np.array(variant_matrix_phased, dtype=np.int8)
+        if call_genotype.ndim != 3:
+            raise ValueError("`variant_matrix_phased` must be a 3D array")
+
+        num_variants, num_samples, ploidy = call_genotype.shape
+        if ploidy == 0:
+            raise ValueError("Ploidy must be greater than zero")
+        variant_position = np.asarray(variant_position, dtype=np.float64)
+        if variant_position.shape != (num_variants,):
+            raise ValueError(
+                "`variant_position` must be a 1D array of the same length as "
+                "the number of variants in variant_matrix_phased"
+            )
+
+        variant_allele = np.asarray(variant_allele, dtype="U")  # make unicode for zarr
+        if variant_allele.ndim != 2 or variant_allele.shape[0] != num_variants:
+            raise ValueError(
+                "`variant_allele` must be a 2D array with the same number of rows as "
+                "variants in `variant_matrix_phased`"
+            )
+
+        if sample_id is None:
+            sample_id = np.arange(call_genotype.shape[1]).astype(str)
+        sample_id = np.asarray(sample_id, dtype="U")
+        if sample_id.shape != (num_samples,):
+            raise ValueError(
+                "`sample_id` must be a 1D array of the same length as the total "
+                "number of samples in `variant_matrix_phased`"
+            )
+
+        if site_mask is None:
+            site_keep = slice(None)
+        else:
+            if site_mask.shape != (num_variants,):
+                raise ValueError(
+                    "`site_mask` must be a 1D array of the same length as the total "
+                    "number of variants in `variant_matrix_phased`"
+                )
+            site_keep = np.logical_not(site_mask)  # turn into an inclusion mask
+            num_variants = np.sum(site_keep)
+
+        if sample_mask is None:
+            sample_keep = slice(None)
+        else:
+            if sample_mask.shape != (num_samples,):
+                raise ValueError(
+                    "`sample_mask` must be a 1D array of the same length as the "
+                    "total number of samples in `variant_matrix_phased`"
+                )
+            sample_keep = np.logical_not(sample_mask)  # turn into an inclusion mask
+            num_samples = np.sum(sample_keep)
+
+        # Further tests must take into account the masked num_samples & num_variants
+
+        max_alleles = np.max(call_genotype[site_keep, sample_keep, :], initial=-1) + 1
+        if variant_allele.shape[1] < max_alleles:
+            raise ValueError(
+                "`variant_allele` must have the same number of columns as the maximum "
+                "value in the unmasked variant_matrix_phased plus one"
+            )
+
+        ancestral_state = np.asarray(ancestral_state, dtype="U")
+        if ancestral_state.shape != (num_variants,):
+            raise ValueError(
+                "`ancestral_state` must be a 1D array of the same length as the "
+                "number of unmasked variants in `variant_matrix_phased`"
+            )
+
+        store = zarr.storage.MemoryStore()
+        root = zarr.group(store=store, overwrite=True)
+        root.create_dataset("variant_position", data=variant_position)
+        root.create_dataset("call_genotype", data=call_genotype)
+        root.create_dataset(  # Assume all phased
+            "call_genotype_phased", data=np.ones(call_genotype.shape[:2], dtype=bool)
+        )
+        root.create_dataset("variant_allele", data=variant_allele)
+        root.create_dataset("sample_id", data=sample_id)
+        return cls(
+            root,
+            ancestral_state,
+            site_mask=site_mask,
+            sample_mask=sample_mask,
+            **kwargs,
+        )
 
     @functools.cached_property
     def format_name(self):